From 97eb5a3ae0faa80b6d14bb3b75226a4f0aef12cb Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Sun, 27 Dec 2015 19:35:55 +0100
Subject: [PATCH] First commit

---
 .gitignore                   |   1 +
 .gitmodules                  |   6 +
 libbmc/__init__.py           |   0
 libbmc/doi.py                | 113 +++++++++++++
 libbmc/external/opendetex    |   1 +
 libbmc/external/poppler      |   1 +
 libbmc/isbn.py               |  55 ++++++
 libbmc/repositories/arxiv.py | 316 +++++++++++++++++++++++++++++++++++
 libbmc/tools.py              |  37 ++++
 requirements.txt             |   2 +
 10 files changed, 532 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 libbmc/__init__.py
 create mode 100644 libbmc/doi.py
 create mode 160000 libbmc/external/opendetex
 create mode 160000 libbmc/external/poppler
 create mode 100644 libbmc/isbn.py
 create mode 100644 libbmc/repositories/arxiv.py
 create mode 100644 libbmc/tools.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..4603d9f
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "libbmc/external/opendetex"]
+	path = libbmc/external/opendetex
+	url = https://github.com/Phyks/opendetex
+[submodule "libbmc/external/poppler"]
+	path = libbmc/external/poppler
+	url = git://git.freedesktop.org/git/poppler/poppler
diff --git a/libbmc/__init__.py b/libbmc/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/libbmc/doi.py b/libbmc/doi.py
new file mode 100644
index 0000000..70860df
--- /dev/null
+++ b/libbmc/doi.py
@@ -0,0 +1,113 @@
+"""
+This file contains all the DOI-related functions.
+"""
+import re
+import requests
+
+from libbmc import tools
+
+# Taken from
+# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
+regex = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
+                   re.IGNORECASE)
+# Base dx.doi.org URL for redirections
+dx_url = "http://dx.doi.org/{doi}"
+
+
+def is_valid(doi):
+    """
+    Check that a given DOI is a valid canonical DOI.
+
+    :param doi: The DOI to be checked.
+    :returns: Boolean indicating whether the DOI is valid or not.
+    """
+    match = regex.match(doi)
+    return ((match is not None) and (match.group(0) == doi))
+
+
+def extract_from_text(text):
+    """
+    Extract canonical DOIs from a text.
+
+    :param text: The text to extract DOIs from.
+    :returns: A list of found DOIs.
+    """
+    return tools.remove_duplicates(regex.findall(text))
+
+
+def to_URL(dois):
+    """
+    Convert canonical DOIs to dx.doi.org URLs.
+
+    :param dois: A canonical DOI, or a list of canonical DOIs.
+    :returns: A DOI URL, or a list of DOI URLs when given a list.
+    """
+    if isinstance(dois, list):
+        return [dx_url.format(doi=doi) for doi in dois]
+    else:
+        return dx_url.format(doi=dois)
+
+
+def to_canonical(urls):
+    """
+    Extract the canonical DOIs contained in DOI URLs.
+
+    :param urls: A DOI URL, or a list of DOI URLs.
+    :returns: The list of DOIs found in the URL, or one such list per URL.
+    """
+    if isinstance(urls, list):
+        return [extract_from_text(url) for url in urls]
+    else:
+        return extract_from_text(urls)
+
+
+def get_oa_version(doi):
+    """
+    Get an OA version for a given DOI.
+
+    .. note::
+
+        Uses beta.dissem.in API.
+
+    :param doi: A canonical DOI.
+    :returns: The URL of the OA version of the given DOI, or ``None``.
+    """
+    # NOTE: the DOI is sent to the API as given (no URL truncation is done)
+    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
+    try:
+        assert(r.status_code == requests.codes.ok)
+        result = r.json()
+        assert(result["status"] == "ok")
+        return result["paper"]["pdf_url"]
+    except (AssertionError, ValueError, KeyError):
+        return None
+
+
+def get_linked_version(doi):
+    """
+    Get the original link behind the DOI.
+
+    :param doi: A canonical DOI.
+    :returns: The ``location`` header of the DOI URL response, or ``None``.
+    """
+    r = requests.head(to_URL(doi))
+    return r.headers.get("location")
+
+
+def get_bibtex(doi):
+    """
+    Get a BibTeX entry for a given DOI.
+
+    .. note::
+
+        Adapted from https://gist.github.com/jrsmith3/5513926.
+
+    :param doi: The canonical DOI to get BibTeX from.
+    :returns: A BibTeX string or ``None``.
+    """
+    r = requests.get(to_URL(doi),
+                     headers={"accept": "application/x-bibtex"})
+    if r.headers.get("content-type") == "application/x-bibtex":
+        return r.text
+    else:
+        return None
diff --git a/libbmc/external/opendetex b/libbmc/external/opendetex
new file mode 160000
index 0000000..b980e67
--- /dev/null
+++ b/libbmc/external/opendetex
@@ -0,0 +1 @@
+Subproject commit b980e6764279df32acd0f693a163d2040f1166b7
diff --git a/libbmc/external/poppler b/libbmc/external/poppler
new file mode 160000
index 0000000..b3425dd
--- /dev/null
+++ b/libbmc/external/poppler
@@ -0,0 +1 @@
+Subproject commit b3425dd3261679958cd56c0f71995c15d2124433
diff --git a/libbmc/isbn.py b/libbmc/isbn.py
new file mode 100644
index 0000000..ce56322
--- /dev/null
+++ b/libbmc/isbn.py
@@ -0,0 +1,55 @@
+"""
+This file contains all the ISBN-related functions.
+"""
+import isbnlib
+
+from libbmc import doi
+
+
+def is_valid(isbn):
+    """
+    Check that a given string is a valid ISBN.
+
+    :param isbn: The ISBN to be checked.
+    :returns: Boolean indicating whether the ISBN is valid or not.
+
+    """
+    return not isbnlib.notisbn(isbn)
+
+
+def extract_from_text(text):
+    """
+    Extract ISBNs from a text.
+
+    :param text: Some text.
+    :returns: A list of canonical ISBNs found in the text.
+    """
+    return [isbnlib.get_canonical_isbn(isbn)
+            for isbn in isbnlib.get_isbnlike(text)]
+
+
+def get_bibtex(isbn):
+    """
+    Get a BibTeX string for the given ISBN, through its DOI.
+
+    :param isbn: ISBN to fetch BibTeX entry for.
+    :returns: A BibTeX string, or ``None`` (see ``doi.get_bibtex``).
+    """
+    return doi.get_bibtex(to_doi(isbn))
+
+
+def to_doi(isbn):
+    """
+    Try to fetch a DOI from a given ISBN.
+
+    :param isbn: A valid ISBN string.
+    :returns: A DOI as string.
+    """
+    return isbnlib.doi(isbn)
+
+
+def from_doi(doi):
+    """
+    TODO: Not implemented yet.
+    """
+    assert(False)
diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py
new file mode 100644
index 0000000..0b6525c
--- /dev/null
+++ b/libbmc/repositories/arxiv.py
@@ -0,0 +1,316 @@
+"""
+This file contains all the arXiv-related functions.
+"""
+import io
+import re
+import requests
+import tarfile
+import xml.etree.ElementTree
+
+from libbmc import tools
+
+
+arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
+arxiv_identifier_before_2007 = r"(" + ("|".join([
+    "astro-ph.GA",
+    "astro-ph.CO",
+    "astro-ph.EP",
+    "astro-ph.HE",
+    "astro-ph.IM",
+    "astro-ph.SR",
+    "cond-math.dis-nn",
+    "cond-math.mtrl-sci",
+    "cond-math.mes-hall",
+    "cond-math.other",
+    "cond-math.quant-gas",
+    "cond-math.soft",
+    "cond-math.stat-mech",
+    "cond-math.str-el",
+    "cond-math.supr-con",
+    "gr-qc",
+    "hep-ex",
+    "hep-lat",
+    "hep-ph",
+    "hep-th",
+    "math-ph",
+    "nlin.AO",
+    "nlin.CG",
+    "nlin.CD",
+    "nlin.SI",
+    "nlin.PS",
+    "nucl-ex",
+    "nucl-th",
+    "physics.acc-ph",
+    "physics.ao-ph",
+    "physics.atom-ph",
+    "physics.atm-clus",
+    "physics.bio-ph",
+    "physics.chem-ph",
+    "physics.class-ph",
+    "physics.comp-ph",
+    "physics.data-an",
+    "physics.flu-dyn",
+    "physics.gen-ph",
+    "physics.geo-ph",
+    "physics.hist-ph",
+    "physics.ins-det",
+    "physics.med-ph",
+    "physics.optics",
+    "physics.ed-ph",
+    "physics.soc-ph",
+    "physics.plasm-ph",
+    "physics.pop-ph",
+    "physics.space-ph",
+    "physics.quant-ph",
+    "math.AG",
+    "math.AT",
+    "math.AP",
+    "math.CT",
+    "math.CA",
+    "math.CO",
+    "math.AC",
+    "math.CV",
+    "math.DG",
+    "math.DS",
+    "math.FA",
+    "math.GM",
+    "math.GN",
+    "math.GT",
+    "math.GR",
+    "math.HO",
+    "math.IT",
+    "math.KT",
+    "math.LO",
+    "math.MP",
+    "math.MG",
+    "math.NT",
+    "math.NA",
+    "math.OA",
+    "math.OC",
+    "math.PR",
+    "math.QA",
+    "math.RT",
+    "math.RA",
+    "math.SP",
+    "math.ST",
+    "math.SG",
+    "cs.AI",
+    "cs.CL",
+    "cs.CC",
+    "cs.CE",
+    "cs.CG",
+    "cs.GT",
+    "cs.CV",
+    "cs.CY",
+    "cs.CR",
+    "cs.DS",
+    "cs.DB",
+    "cs.DL",
+    "cs.DM",
+    "cs.DC",
+    "cs.ET",
+    "cs.FL",
+    "cs.GL",
+    "cs.GR",
+    "cs.AR",
+    "cs.HC",
+    "cs.IR",
+    "cs.IT",
+    "cs.LG",
+    "cs.LO",
+    "cs.MS",
+    "cs.MA",
+    "cs.MM",
+    "cs.NI",
+    "cs.NE",
+    "cs.NA",
+    "cs.OS",
+    "cs.OH",
+    "cs.PF",
+    "cs.PL",
+    "cs.RO",
+    "cs.SI",
+    "cs.SE",
+    "cs.SD",
+    "cs.SC",
+    "cs.SY",
+    "q-bio.BM",
+    "q-bio.CB",
+    "q-bio.GN",
+    "q-bio.MN",
+    "q-bio.NC",
+    "q-bio.OT",
+    "q-bio.PE",
+    "q-bio.QM",
+    "q-bio.SC",
+    "q-bio.TO",
+    "q-fin.CP",
+    "q-fin.EC",
+    "q-fin.GN",
+    "q-fin.MF",
+    "q-fin.PM",
+    "q-fin.PR",
+    "q-fin.RM",
+    "q-fin.ST",
+    "q-fin.TR",
+    "stat.AP",
+    "stat.CO",
+    "stat.ML",
+    "stat.ME",
+    "stat.OT",
+    "stat.TH"])) + r")/\d+"
+regex = re.compile(
+    "(" + arxiv_identifier_from_2007 + ")|(" +
+    arxiv_identifier_before_2007 + ")",
+    re.IGNORECASE)
+
+# Base arXiv URL used as id sometimes
+arxiv_url = "http://arxiv.org/abs/{arxiv_id}"
+# Eprint URL used to download sources
+arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}"
+
+
+def is_valid(arxiv_id):
+    """
+    Check that a given arXiv ID is a valid one.
+
+    :param arxiv_id: The arXiv ID to be checked.
+    :returns: Boolean indicating whether the arXiv ID is valid or not.
+    """
+    match = regex.match(arxiv_id)
+    return ((match is not None) and (match.group(0) == arxiv_id))
+
+
+def get_bibtex(arxiv_id):
+    """
+    TODO: Not implemented yet.
+    """
+    assert(False)
+
+
+def extract_from_text(text):
+    """
+    Extract arXiv IDs from a text.
+
+    :param text: The text to extract arXiv IDs from.
+    :returns: A deduplicated list of ``regex.findall`` matches (group tuples).
+    """
+    return tools.remove_duplicates(regex.findall(text))
+
+
+def to_URL(arxiv_ids):
+    """
+    Convert arXiv IDs to arXiv URLs.
+
+    :param arxiv_ids: An arXiv ID, or a list of arXiv IDs.
+    :returns: An arXiv URL, or a list of arXiv URLs when given a list.
+    """
+    if isinstance(arxiv_ids, list):
+        return [arxiv_url.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
+    else:
+        return arxiv_url.format(arxiv_id=arxiv_ids)
+
+
+def to_canonical(urls):
+    """
+    Extract the arXiv IDs contained in arXiv URLs.
+
+    :param urls: An arXiv URL, or a list of arXiv URLs.
+    :returns: The ``extract_from_text`` result for the URL, or one per URL.
+    """
+    if isinstance(urls, list):
+        return [extract_from_text(url) for url in urls]
+    else:
+        return extract_from_text(urls)
+
+
+def from_doi(doi):
+    """
+    Get the arXiv eprint id for a given DOI.
+
+    .. note::
+
+        Uses arXiv API. Will not return anything if arXiv is not aware of the
+        associated DOI.
+
+    :param doi: The DOI of the resource to look for.
+    :returns: The arXiv eprint id, or ``None`` if not found.
+    """
+    r = requests.get("http://export.arxiv.org/api/query",
+                     params={
+                         "search_query": "doi:%s" % (doi,),
+                         "max_results": 1
+                     })
+    e = xml.etree.ElementTree.fromstring(r.content)
+    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
+        id = entry.find("{http://www.w3.org/2005/Atom}id").text
+        # id is an arXiv full URL. We only want the id which is the last URL
+        # component.
+        return id.split("/")[-1]
+    return None
+
+
+def to_doi(arxiv_id):
+    """
+    Get the associated DOI for a given arXiv eprint.
+
+    .. note::
+
+        Uses arXiv API. Will not return anything if arXiv is not aware of the
+        associated DOI.
+
+    :param arxiv_id: The arXiv eprint id.
+    :returns: The DOI if any, or ``None``.
+    """
+    r = requests.get("http://export.arxiv.org/api/query",
+                     params={
+                         "id_list": arxiv_id,
+                         "max_results": 1
+                     })
+    e = xml.etree.ElementTree.fromstring(r.content)
+    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
+        doi = entry.find("{http://arxiv.org/schemas/atom}doi")
+        if doi is not None:
+            return doi.text
+    return None
+
+
+def get_sources(arxiv_id):
+    """
+    Download sources on arXiv for a given preprint.
+
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) \
+        in a canonical form.
+    :returns: A ``TarFile`` object of the sources of the arXiv preprint or \
+        ``None``.
+    """
+    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
+    try:
+        assert(r.status_code == requests.codes.ok)
+        file_object = io.BytesIO(r.content)
+        return tarfile.open(fileobj=file_object)
+    except (AssertionError, tarfile.TarError):
+        return None
+
+
+def get_bbl(arxiv_id):
+    """
+    Get the .bbl files (if any) of a given preprint.
+
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) \
+        in a canonical form.
+    :returns: A list of the full text of the ``.bbl`` files (may be \
+        empty). NOTE(review): a ``None`` from ``get_sources`` is not handled.
+    """
+    tf = get_sources(arxiv_id)
+    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
+    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
+                 for member in bbl_files]
+    return bbl_files
+
+
+def get_citations(arxiv_id):
+    """
+    TODO: Not implemented yet.
+    """
+    assert(False)
diff --git a/libbmc/tools.py b/libbmc/tools.py
new file mode 100644
index 0000000..0088d3d
--- /dev/null
+++ b/libbmc/tools.py
@@ -0,0 +1,37 @@
+"""
+This file contains various utility functions.
+"""
+
+
+def replaceAll(text, replace_dict):
+    """
+    Replace multiple strings in a text.
+
+    :param text: Text to replace in.
+    :param replace_dict: Dictionary mapping strings to replace with their \
+            substitution.
+    :returns: Text after replacements.
+    """
+    for i, j in replace_dict.items():
+        text = text.replace(i, j)
+    return text
+
+
+def clean_whitespaces(text):
+    """
+    Remove multiple whitespaces from text.
+
+    :param text: Text to remove multiple whitespaces from.
+    :returns: A cleaned text.
+    """
+    return ' '.join(text.strip().split())
+
+
+def remove_duplicates(some_list):
+    """
+    Remove the duplicates from a list.
+
+    :param some_list: List to remove duplicates from.
+    :returns: A list without duplicates (original order is not kept).
+    """
+    return list(set(some_list))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c721aa3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+isbnlib==3.5.7
+requests==2.9.1