From d8b74ae356f7398c601a36135039475530a83e1c Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Sun, 27 Dec 2015 23:46:43 +0100
Subject: [PATCH] Reimport bbl citations parsing and make some minor fixes

---
 libbmc/citations/bbl.py      | 125 +++++++++++++++++++++++++++++++++++
 libbmc/doi.py                |  37 ++++++-----
 libbmc/isbn.py               |   4 +-
 libbmc/repositories/arxiv.py | 106 ++++++++++++++++++++---------
 libbmc/repositories/hal.py   |   1 +
 libbmc/tools.py              |  44 +++++++++++-
 requirements.txt             |   5 +++--
 7 files changed, 271 insertions(+), 51 deletions(-)
 create mode 100644 libbmc/citations/bbl.py
 create mode 100644 libbmc/repositories/hal.py

diff --git a/libbmc/citations/bbl.py b/libbmc/citations/bbl.py
new file mode 100644
index 0000000..10aef3e
--- /dev/null
+++ b/libbmc/citations/bbl.py
@@ -0,0 +1,125 @@
+"""
+This file contains all the functions to deal with .bbl files.
+"""
+import os
+import re
+import requests
+import subprocess
+
+from requests.exceptions import RequestException
+
+from libbmc import doi
+from libbmc import tools
+from libbmc.repositories import arxiv
+
+
+# Regex to match bibitems
+BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
+# Regex to match the end of the bibliography
+ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography\}.*")
+
+
+# CrossRef API URL
+CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
+CROSSREF_MAX_BATCH_SIZE = 10
+
+
+def bibitem_as_plaintext(bibitem):
+    """
+    Return a plaintext representation of a bibitem from the ``.bbl`` file.
+
+    .. note::
+
+        This plaintext representation can be super ugly, contain URLs and \
+        so on.
+
+    :param bibitem: The text content of the bibitem.
+    :returns: A cleaned plaintext citation from the bibitem.
+    """
+    try:
+        # Use the delatex binary from the PATH, if available
+        output = subprocess.check_output(["delatex", "-s"],
+                                         input=bibitem.encode("utf-8"))
+    except FileNotFoundError:
+        # Else, fall back on the bundled opendetex build
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        output = subprocess.check_output(
+            ["%s/../external/opendetex/delatex" % (script_dir,), "-s"],
+            input=bibitem.encode("utf-8"))
+    output = output.decode("utf-8")
+    output = tools.clean_whitespaces(output)
+    return output
+
+
+def get_plaintext_citations(bbl):
+    """
+    Parse a ``.bbl`` file to get a clean list of plaintext citations.
+
+    :param bbl: Either the path to the ``.bbl`` file or the content of a \
+        ``.bbl`` file.
+    :returns: A list of cleaned plaintext citations.
+    """
+    # Handle path or content
+    if os.path.isfile(bbl):
+        with open(bbl, 'r') as fh:
+            bbl_content = fh.read()
+    else:
+        bbl_content = bbl
+    # Get a list of bibitems, dropping the first chunk of the split, as it
+    # comes *before* the first \bibitem
+    bibitems = BIBITEMS_REGEX.split(bbl_content)[1:]
+    # Delete the text after \end{thebibliography}
+    bibitems = [ENDTHEBIBLIOGRAPHY_REGEX.sub("", i).strip() for i in bibitems]
+    # Clean every bibitem to get plaintext
+    cleaned_bbl = [bibitem_as_plaintext(bibitem) for bibitem in bibitems]
+    return cleaned_bbl
+
+
+def get_cited_DOIs(bbl):
+    """
+    Get the DOIs of the papers cited in a ``.bbl`` file.
+
+    :param bbl: Either the path to a ``.bbl`` file or the content of a \
+        ``.bbl`` file.
+
+    :returns: A dict of cleaned plaintext citations and their associated DOI.
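+
+    A hypothetical usage sketch (the file name, citation text and DOI below
+    are made-up values, for illustration only):
+
+    .. code-block:: python
+
+        >>> get_cited_DOIs("paper.bbl")
+        {'J. Doe, Some article title, Journal (2015).': '10.1000/xyz123'}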
+ """ + dois = {} + crossref_queue = [] + # Get the plaintext citations from the bbl file + plaintext_citations = get_plaintext_citations(bbl) + # Try to get the DOI directly from the citation + for citation in plaintext_citations[:]: + # Some citations already contain a DOI so try to match it directly + matched_DOIs = doi.extract_from_text(citation) + if matched_DOIs is not None: + # Add the DOI and go on + dois[citation] = matched_DOIs[0] + continue + # Same thing for arXiv id + matched_arXiv = arxiv.extract_from_text(citation) + if matched_arXiv is not None: + # Add the associated DOI and go on + dois[citation] = arxiv.to_DOI(matched_arXiv[0]) + continue + # If no match found, stack it for next step + # Note to remove URLs in the citation as the plaintext citations can + # contain URLs and they are bad for the CrossRef API. + crossref_queue.append(tools.remove_URLs(citation)) + # Do batch of papers, to prevent from the timeout of crossref + for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE): + try: + # Fetch results from CrossRef + r = requests.post(CROSSREF_LINKS_API_URL, json=batch) + for result in r.json()["results"]: + # Try to get a DOI + try: + dois[result["text"]] = result["doi"] + except KeyError: + # Or set it to None + dois[result["text"]] = None + except (RequestException, ValueError, KeyError): + # If an exception occurred, set all the DOIs to None for the + # current batch + for i in batch: + dois[i] = None + return dois diff --git a/libbmc/doi.py b/libbmc/doi.py index 70860df..45b46ea 100644 --- a/libbmc/doi.py +++ b/libbmc/doi.py @@ -4,14 +4,16 @@ This file contains all the DOI-related functions. import re import requests +from requests.exception import RequestException + from libbmc import tools # Taken from # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802 -regex = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b", +REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b", re.IGNORECASE) # Base dx.doi.org URL for redirections -dx_url = "http://dx.doi.org/{doi}" +DX_URL = "http://dx.doi.org/{doi}" def is_valid(doi): @@ -21,7 +23,7 @@ def is_valid(doi): :param doi: The DOI to be checked. :returns: Boolean indicating whether the DOI is valid or not. """ - match = regex.match(doi) + match = REGEX.match(doi) return ((match is not None) and (match.group(0) == doi)) @@ -32,7 +34,7 @@ def extract_from_text(text): :param text: The text to extract DOIs from. :returns: A list of found DOIs. """ - return tools.remove_duplicates(regex.findall(text)) + return tools.remove_duplicates(REGEX.findall(text)) def to_URL(dois): @@ -43,9 +45,9 @@ def to_URL(dois): :returns: A list of DOIs URLs. """ if isinstance(dois, list): - return [dx_url.format(doi=doi) for doi in dois] + return [DX_URL.format(doi=doi) for doi in dois] else: - return dx_url.format(doi=dois) + return DX_URL.format(doi=dois) def to_canonical(urls): @@ -73,13 +75,13 @@ def get_oa_version(doi): :returns: The URL of the OA version of the given DOI, or ``None``. 
""" # If DOI is a link, truncate it - r = requests.get("http://beta.dissem.in/api/%s" % (doi,)) try: - assert(r.status_code == requests.codes.ok) + r = requests.get("http://beta.dissem.in/api/%s" % (doi,)) + r.raise_for_status() result = r.json() assert(result["status"] == "ok") return result["paper"]["pdf_url"] - except (AssertionError, ValueError, KeyError): + except (AssertionError, ValueError, KeyError, RequestException): return None @@ -90,8 +92,11 @@ def get_linked_version(doi): :param doi: A canonical DOI. :returns: The canonical URL behind the DOI, or ``None``. """ - r = requests.head(to_URL(doi)) - return r.headers.get("location") + try: + r = requests.head(to_URL(doi)) + return r.headers.get("location") + except RequestException: + return None def get_bibtex(doi): @@ -105,9 +110,11 @@ def get_bibtex(doi): :param doi: The canonical DOI to get BibTeX from. :returns: A BibTeX string or ``None``. """ - r = requests.get(to_URL(doi), - headers={"accept": "application/x-bibtex"}) - if r.headers.get("content-type") == "application/x-bibtex": + try: + r = requests.get(to_URL(doi), + headers={"accept": "application/x-bibtex"}) + r.raise_for_status() + assert(r.headers.get("content-type") == "application/x-bibtex") return r.text - else: + except (RequestException, AssertionError): return None diff --git a/libbmc/isbn.py b/libbmc/isbn.py index ce56322..594f93a 100644 --- a/libbmc/isbn.py +++ b/libbmc/isbn.py @@ -35,10 +35,10 @@ def get_bibtex(isbn): :param isbn: ISBN to fetch BibTeX entry for. :returns: A BibTeX string. """ - return doi.get_bibtex(to_doi(isbn)) + return doi.get_bibtex(to_DOI(isbn)) -def to_doi(isbn): +def to_DOI(isbn): """ Try to fetch a DOI from a given ISBN. diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py index 0b6525c..b013757 100644 --- a/libbmc/repositories/arxiv.py +++ b/libbmc/repositories/arxiv.py @@ -1,17 +1,23 @@ """ This file contains all the arXiv-related functions. """ +import arxiv2bib import io import re import requests import tarfile import xml.etree.ElementTree +from urllib.error import HTTPError +from requests.exception import RequestException + + from libbmc import tools +from libbmc.citations import bbl -arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(v\d+)?" -arxiv_identifier_before_2007 = r"(" + ("|".join([ +ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?" +ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([ "astro-ph.GA", "astro-ph.CO", "astro-ph.EP", @@ -159,15 +165,15 @@ arxiv_identifier_before_2007 = r"(" + ("|".join([ "stat.ME", "stat.OT", "stat.TH"])) + r")/\d+" -regex = re.compile( - "(" + arxiv_identifier_from_2007 + ")|(" + - arxiv_identifier_before_2007 + ")", +REGEX = re.compile( + "(" + ARXIV_IDENTIFIER_FROM_2007 + ")|(" + + ARXIV_IDENTIFIER_BEFORE_2007 + ")", re.IGNORECASE) # Base arXiv URL used as id sometimes -arxiv_url = "http://arxiv.org/abs/{arxiv_id}" +ARXIV_URL = "http://arxiv.org/abs/{arxiv_id}" # Eprint URL used to download sources -arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}" +ARXIV_EPRINT_URL = "http://arxiv.org/e-print/{arxiv_id}" def is_valid(arxiv_id): @@ -177,15 +183,35 @@ def is_valid(arxiv_id): :param arxiv_id: The arXiv ID to be checked. :returns: Boolean indicating whether the arXiv ID is valid or not. """ - match = regex.match(arxiv_id) + match = REGEX.match(arxiv_id) return ((match is not None) and (match.group(0) == arxiv_id)) def get_bibtex(arxiv_id): """ - TODO + Get a BibTeX entry for a given DOI. + + .. note:: + + Using awesome https://pypi.python.org/pypi/arxiv2bib/ module. 
+
+    :param arxiv_id: The canonical arXiv id to get BibTeX from.
+    :returns: A BibTeX string or ``None``.
     """
-    assert(False)
+    # Fetch the BibTeX entries using the arxiv2bib module
+    try:
+        bibtex = arxiv2bib.arxiv2bib([arxiv_id])
+    except HTTPError:
+        bibtex = []
+
+    for bib in bibtex:
+        # Skip the error entries and return the first valid BibTeX entry
+        if not isinstance(bib, arxiv2bib.ReferenceErrorInfo):
+            return bib.bibtex()
+    # No valid entry could be fetched, return None
+    return None
 
 
 def extract_from_text(text):
@@ -195,7 +221,7 @@ def extract_from_text(text):
     :param text: The text to extract arXiv IDs from.
     :returns: A list of matching arXiv IDs.
     """
-    return tools.remove_duplicates(regex.findall(text))
+    # Use finditer and group(0) rather than findall: REGEX contains several
+    # capturing groups, so findall would return tuples of groups instead of
+    # the full matches
+    return tools.remove_duplicates([m.group(0)
+                                    for m in REGEX.finditer(text)])
 
 
 def to_URL(arxiv_ids):
@@ -206,9 +232,9 @@ def to_URL(arxiv_ids):
-    :returns: A list of DOIs URLs.
+    :returns: A list of arXiv URLs.
     """
     if isinstance(arxiv_ids, list):
-        return [arxiv_url.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
+        return [ARXIV_URL.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
     else:
-        return arxiv_url.format(arxiv_id=arxiv_ids)
+        return ARXIV_URL.format(arxiv_id=arxiv_ids)
 
 
 def to_canonical(urls):
@@ -236,11 +262,15 @@ def from_doi(doi):
     :param doi: The DOI of the resource to look for.
     :returns: The arXiv eprint id, or ``None`` if not found.
     """
-    r = requests.get("http://export.arxiv.org/api/query",
-                     params={
-                         "search_query": "doi:%s" % (doi,),
-                         "max_results": 1
-                     })
+    try:
+        r = requests.get("http://export.arxiv.org/api/query",
+                         params={
+                             "search_query": "doi:%s" % (doi,),
+                             "max_results": 1
+                         })
+        r.raise_for_status()
+    except RequestException:
+        return None
     e = xml.etree.ElementTree.fromstring(r.content)
     for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
         id = entry.find("{http://www.w3.org/2005/Atom}id").text
@@ -250,7 +280,7 @@ def from_doi(doi):
     return None
 
 
-def to_doi(arxiv_id):
+def to_DOI(arxiv_id):
     """
     Get the associated DOI for a given arXiv eprint.
@@ -262,11 +292,15 @@ def to_doi(arxiv_id):
 
-    :param eprint: The arXiv eprint id.
+    :param arxiv_id: The arXiv eprint id.
     :returns: The DOI if any, or ``None``.
     """
-    r = requests.get("http://export.arxiv.org/api/query",
-                     params={
-                         "id_list": arxiv_id,
-                         "max_results": 1
-                     })
+    try:
+        r = requests.get("http://export.arxiv.org/api/query",
+                         params={
+                             "id_list": arxiv_id,
+                             "max_results": 1
+                         })
+        r.raise_for_status()
+    except RequestException:
+        return None
     e = xml.etree.ElementTree.fromstring(r.content)
     for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
         doi = entry.find("{http://arxiv.org/schemas/atom}doi")
@@ -284,12 +318,12 @@ def get_sources(arxiv_id):
     :returns: A ``TarFile`` object of the sources of the arXiv preprint or \
         ``None``.
     """
-    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
     try:
-        assert(r.status_code == requests.codes.ok)
+        r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
+        r.raise_for_status()
         file_object = io.BytesIO(r.content)
         return tarfile.open(fileobj=file_object)
-    except (AssertionError, tarfile.TarError):
+    except (RequestException, tarfile.TarError):
         return None
 
 
 def get_bbl(arxiv_id):
@@ -297,8 +331,8 @@ def get_bbl(arxiv_id):
     """
     Get the .bbl files (if any) of a given preprint.
 
-    :param eprint: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in a \
-        canonical form.
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
+        a canonical form.
     :returns: A list of the full text of the ``.bbl`` files (if any) \
         or ``None``.
     """
@@ -311,6 +345,16 @@ def get_bbl(arxiv_id):
 
 def get_citations(arxiv_id):
     """
-    TODO
+    Get the DOIs cited by a given preprint.
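+
+    A hypothetical usage sketch (the citation text and DOI are made-up
+    values, for illustration only):
+
+    .. code-block:: python
+
+        >>> get_citations("1401.2910")
+        {'J. Doe, Some article title, Journal (2015).': '10.1000/xyz123'}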
+
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
+        a canonical form.
+    :returns: A dict of cleaned plaintext citations and their associated DOI.
     """
-    assert(False)
+    # The import is done here to avoid a circular import:
+    # libbmc.citations.bbl itself imports this module at load time.
+    from libbmc.citations import bbl
+
+    dois = {}
+    # Get the list of bbl files for this preprint
+    bbl_files = get_bbl(arxiv_id)
+    if bbl_files is None:
+        # The sources could not be fetched, no citations can be extracted
+        return dois
+    for bbl_file in bbl_files:
+        # Fetch the cited DOIs for each of the bbl files
+        dois.update(bbl.get_cited_DOIs(bbl_file))
+    return dois
diff --git a/libbmc/repositories/hal.py b/libbmc/repositories/hal.py
new file mode 100644
index 0000000..4640904
--- /dev/null
+++ b/libbmc/repositories/hal.py
@@ -0,0 +1 @@
+# TODO
diff --git a/libbmc/tools.py b/libbmc/tools.py
index 0088d3d..2116c06 100644
--- a/libbmc/tools.py
+++ b/libbmc/tools.py
@@ -1,6 +1,12 @@
 """
 This file contains various utility functions.
 """
+import re
+from itertools import islice, chain
+
+
+# Huge URL regex taken from https://gist.github.com/gruber/8891611
+URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))")
 
 
 def remove_duplicates(some_list):
@@ -13,0 +20,43 @@ def remove_duplicates(some_list):
+
+
+def clean_whitespaces(text):
+    """
+    Remove multiple whitespaces from the text, and remove leading and \
+        trailing whitespaces.
+
+    :param text: Text to remove multiple whitespaces from.
+    :returns: A cleaned text.
+    """
+    return ' '.join(text.strip().split())
+
+
+def remove_URLs(text):
+    """
+    Remove URLs from a given text.
+
+    :param text: The text to remove URLs from.
+    :returns: The text with URLs removed.
+    """
+    return clean_whitespaces(URL_REGEX.sub("", text))
+
+
+def batch(iterable, size):
+    """
+    Get items from a sequence a batch at a time.
+
+    .. note::
+
+        Adapted from
+        https://code.activestate.com/recipes/303279-getting-items-in-batches/
+
+    :param iterable: The iterable to get batches from.
+    :param size: The size of the batches.
+    :returns: A new batch of the given size at each time.
+    """
+    item = iter(iterable)
+    while True:
+        batch_iterator = islice(item, size)
+        try:
+            yield chain([next(batch_iterator)], batch_iterator)
+        except StopIteration:
+            return
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
-isbnlib
-requests
+arxiv2bib>=1.0.7
+isbnlib>=3.5.7
+requests>=2.9.1