Reimport bbl citations parsing and make some minor fixes

2015-12-27 23:46:43 +01:00 · 2015-12-27 23:46:43 +01:00 · d8b74ae356
parent 97eb5a3ae0
commit d8b74ae356
7 changed files with 271 additions and 51 deletions
--- a/libbmc/citations/bbl.py
+++ b/libbmc/citations/bbl.py
@ -0,0 +1,125 @@
 """
 This file contains all the functions to deal with .bbl files.
 """
 import os
 import re
 import requests
 import subprocess
 from requests.exception import RequestException
 from libbmc import doi
 from libbmc import tools
 from libbmc.repositories import arxiv
 # Regex matching a ``\bibitem{key}`` marker (non-greedy on the key). It is
 # used as a separator to split the content of a .bbl file into bibitems.
 BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
 # Regex matching ``\end{thebibliography}`` and whatever follows it on the
 # same line (``.`` does not match newlines as no DOTALL flag is set).
 ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
 # CrossRef API URL, plaintext citations are POSTed to it to look for DOIs
 CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
 # Maximum number of citations sent to CrossRef in a single request
 CROSSREF_MAX_BATCH_SIZE = 10
 def bibitem_as_plaintext(bibitem):
    """
    Convert a bibitem from a ``.bbl`` file to plaintext.
    The bibitem is piped to the ``delatex`` program (called with ``-s``). \
    The ``delatex`` found on the ``PATH`` is tried first, then the copy \
    under ``../external/opendetex/`` relative to this file.
    .. note::
        The resulting plaintext can be super ugly, contain URLs and so on.
    :param bibitem: The text content of the bibitem.
    :returns: A cleaned plaintext citation from the bibitem.
    """
    encoded_bibitem = bibitem.encode("utf-8")
    try:
        raw_output = subprocess.check_output(["delatex", "-s"],
                                             input=encoded_bibitem)
    except FileNotFoundError:
        # No delatex executable available, fall back on the bundled one
        here = os.path.dirname(os.path.abspath(__file__))
        bundled_delatex = "%s/../external/opendetex/delatex" % (here,)
        raw_output = subprocess.check_output([bundled_delatex, "-s"],
                                             input=encoded_bibitem)
    return tools.clean_whitespaces(raw_output.decode("utf-8"))
 def get_plaintext_citations(bbl):
    """
    Extract the list of plaintext citations from a ``*.bbl`` file.
    :param bbl: Either the path to the .bbl file or the content of a ``.bbl`` \
            file.
    :returns:  A list of cleaned plaintext citations.
    """
    # Assume we were given the content, unless it is the path of a file
    bbl_content = bbl
    if os.path.isfile(bbl):
        with open(bbl, 'r') as fh:
            bbl_content = fh.read()
    # Whatever comes *before* the first \bibitem is not a citation, drop it
    raw_bibitems = BIBITEMS_REGEX.split(bbl_content)[1:]
    citations = []
    for raw_bibitem in raw_bibitems:
        # Drop the \end{thebibliography} and the text following it
        trimmed = ENDTHEBIBLIOGRAPHY_REGEX.sub("", raw_bibitem).strip()
        # Turn the LaTeX bibitem into plaintext
        citations.append(bibitem_as_plaintext(trimmed))
    return citations
 def get_cited_DOIs(bbl):
    """
    Get the DOIs of the papers cited in this .bbl file.
    Each plaintext citation is first searched for a DOI, then for an arXiv \
    id (converted with ``arxiv.to_DOI``). The remaining citations are \
    stripped of their URLs and sent by batches to the CrossRef links API.
    :param bbl: Either the path to a .bbl file or the content of a .bbl file.
    :returns: A dict of cleaned plaintext citations and their associated \
            DOI (``None`` if no DOI could be found). Citations looked up \
            through CrossRef are keyed by the ``text`` field of the \
            CrossRef result, or by their URL-stripped text if the request \
            failed.
    """
    dois = {}
    crossref_queue = []
    # Try to get the DOI directly from the citation
    for citation in get_plaintext_citations(bbl):
        # Some citations already contain a DOI so try to match it directly.
        # extract_from_text returns a (possibly empty) list, hence the
        # emptiness test before taking the first match.
        matched_DOIs = doi.extract_from_text(citation)
        if matched_DOIs:
            dois[citation] = matched_DOIs[0]
            continue
        # Same thing for arXiv id
        # NOTE(review): arxiv.REGEX is built with capture groups, so a
        # findall-based extraction may yield tuples rather than plain ids.
        # Confirm what arxiv.to_DOI actually receives here.
        matched_arXiv = arxiv.extract_from_text(citation)
        if matched_arXiv:
            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
            continue
        # If no match found, stack it for next step
        # Note to remove URLs in the citation as the plaintext citations can
        # contain URLs and they are bad for the CrossRef API.
        crossref_queue.append(tools.remove_URLs(citation))
    # Do batch of papers, to prevent from the timeout of crossref
    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
        # Materialise the batch: it is serialised as JSON and may be iterated
        # a second time in the error handler, which a one-shot iterator
        # cannot support.
        batch = list(batch)
        try:
            # Fetch results from CrossRef
            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
            # HTTP errors are handled as any other failed request
            r.raise_for_status()
            for result in r.json()["results"]:
                # Store the DOI, or None if CrossRef did not return any
                dois[result["text"]] = result.get("doi")
        # The exception class is reached through the requests package, so
        # that this handler does not rely on a separately imported name.
        except (requests.exceptions.RequestException, ValueError, KeyError):
            # If an exception occurred, set all the DOIs to None for the
            # current batch
            for i in batch:
                dois[i] = None
    return dois
--- a/libbmc/doi.py
+++ b/libbmc/doi.py
@ -4,14 +4,16 @@ This file contains all the DOI-related functions.
 import re
 import requests
 from requests.exception import RequestException
 from libbmc import tools
 # Taken from
 # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
-regex = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
+REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
                   re.IGNORECASE)
 # Base dx.doi.org URL for redirections
-dx_url = "http://dx.doi.org/{doi}"
+DX_URL = "http://dx.doi.org/{doi}"
 def is_valid(doi):
@ -21,7 +23,7 @@ def is_valid(doi):
    :param doi: The DOI to be checked.
    :returns: Boolean indicating whether the DOI is valid or not.
    """
-    match = regex.match(doi)
+    match = REGEX.match(doi)
    return ((match is not None) and (match.group(0) == doi))
@ -32,7 +34,7 @@ def extract_from_text(text):
    :param text: The text to extract DOIs from.
    :returns: A list of found DOIs.
    """
-    return tools.remove_duplicates(regex.findall(text))
+    return tools.remove_duplicates(REGEX.findall(text))
 def to_URL(dois):
@ -43,9 +45,9 @@ def to_URL(dois):
    :returns: A list of DOIs URLs.
    """
    if isinstance(dois, list):
-        return [dx_url.format(doi=doi) for doi in dois]
+        return [DX_URL.format(doi=doi) for doi in dois]
    else:
-        return dx_url.format(doi=dois)
+        return DX_URL.format(doi=dois)
 def to_canonical(urls):
@ -73,13 +75,13 @@ def get_oa_version(doi):
    :returns: The URL of the OA version of the given DOI, or ``None``.
    """
    # If DOI is a link, truncate it
    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
    try:
-        assert(r.status_code == requests.codes.ok)
+        r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
        r.raise_for_status()
        result = r.json()
        assert(result["status"] == "ok")
        return result["paper"]["pdf_url"]
-    except (AssertionError, ValueError, KeyError):
+    except (AssertionError, ValueError, KeyError, RequestException):
        return None
@ -90,8 +92,11 @@ def get_linked_version(doi):
    :param doi: A canonical DOI.
    :returns: The canonical URL behind the DOI, or ``None``.
    """
    try:
        r = requests.head(to_URL(doi))
        return r.headers.get("location")
    except RequestException:
        return None
 def get_bibtex(doi):
@ -105,9 +110,11 @@ def get_bibtex(doi):
    :param doi: The canonical DOI to get BibTeX from.
    :returns: A BibTeX string or ``None``.
    """
    try:
        r = requests.get(to_URL(doi),
                         headers={"accept": "application/x-bibtex"})
-    if r.headers.get("content-type") == "application/x-bibtex":
+        r.raise_for_status()
        assert(r.headers.get("content-type") == "application/x-bibtex")
        return r.text
-    else:
+    except (RequestException, AssertionError):
        return None
--- a/libbmc/isbn.py
+++ b/libbmc/isbn.py
@ -35,10 +35,10 @@ def get_bibtex(isbn):
    :param isbn: ISBN to fetch BibTeX entry for.
    :returns: A BibTeX string.
    """
-    return doi.get_bibtex(to_doi(isbn))
+    return doi.get_bibtex(to_DOI(isbn))
-def to_doi(isbn):
+def to_DOI(isbn):
    """
    Try to fetch a DOI from a given ISBN.
--- a/libbmc/repositories/arxiv.py
+++ b/libbmc/repositories/arxiv.py
@ -1,17 +1,23 @@
 """
 This file contains all the arXiv-related functions.
 """
 import arxiv2bib
 import io
 import re
 import requests
 import tarfile
 import xml.etree.ElementTree
 from urllib.error import HTTPError
 from requests.exception import RequestException
 from libbmc import tools
 from libbmc.citations import bbl
-arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
+ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
-arxiv_identifier_before_2007 = r"(" + ("|".join([
+ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
    "astro-ph.GA",
    "astro-ph.CO",
    "astro-ph.EP",
@ -159,15 +165,15 @@ arxiv_identifier_before_2007 = r"(" + ("|".join([
    "stat.ME",
    "stat.OT",
    "stat.TH"])) + r")/\d+"
-regex = re.compile(
+REGEX = re.compile(
-    "(" + arxiv_identifier_from_2007 + ")|(" +
+    "(" + ARXIV_IDENTIFIER_FROM_2007 + ")|(" +
-    arxiv_identifier_before_2007 + ")",
+    ARXIV_IDENTIFIER_BEFORE_2007 + ")",
    re.IGNORECASE)
 # Base arXiv URL used as id sometimes
-arxiv_url = "http://arxiv.org/abs/{arxiv_id}"
+ARXIV_URL = "http://arxiv.org/abs/{arxiv_id}"
 # Eprint URL used to download sources
-arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}"
+ARXIV_EPRINT_URL = "http://arxiv.org/e-print/{arxiv_id}"
 def is_valid(arxiv_id):
@ -177,15 +183,35 @@ def is_valid(arxiv_id):
    :param arxiv_id: The arXiv ID to be checked.
    :returns: Boolean indicating whether the arXiv ID is valid or not.
    """
-    match = regex.match(arxiv_id)
+    match = REGEX.match(arxiv_id)
    return ((match is not None) and (match.group(0) == arxiv_id))
 def get_bibtex(arxiv_id):
    """
-    TODO
+    Get a BibTeX entry for a given arXiv id.
    .. note::
        Using awesome https://pypi.python.org/pypi/arxiv2bib/ module.
    :param arxiv_id: The canonical arXiv id to get BibTeX from.
    :returns: A BibTeX string or ``None``.
    """
-    assert(False)
+    # Fetch bibtex using arxiv2bib module
    try:
        bibtex = arxiv2bib.arxiv2bib([arxiv_id])
    except HTTPError:
        bibtex = []
    for bib in bibtex:
        if isinstance(bib, arxiv2bib.ReferenceErrorInfo):
            continue
        else:
            # Return fetched bibtex
            return bib.bibtex()
    # An error occurred, return None
    return None
 def extract_from_text(text):
@ -195,7 +221,7 @@ def extract_from_text(text):
    :param text: The text to extract arXiv IDs from.
    :returns: A list of matching arXiv IDs.
    """
-    return tools.remove_duplicates(regex.findall(text))
+    return tools.remove_duplicates(REGEX.findall(text))
 def to_URL(arxiv_ids):
@ -206,9 +232,9 @@ def to_URL(arxiv_ids):
    :returns: A list of arXiv URLs.
    """
    if isinstance(arxiv_ids, list):
-        return [arxiv_url.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
+        return [ARXIV_URL.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
    else:
-        return arxiv_url.format(arxiv_id=arxiv_ids)
+        return ARXIV_URL.format(arxiv_id=arxiv_ids)
 def to_canonical(urls):
@ -236,11 +262,15 @@ def from_doi(doi):
    :param doi: The DOI of the resource to look for.
    :returns: The arXiv eprint id, or ``None`` if not found.
    """
    try:
        r = requests.get("http://export.arxiv.org/api/query",
                         params={
                             "search_query": "doi:%s" % (doi,),
                             "max_results": 1
                         })
        r.raise_for_status()
    except RequestException:
        return None
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
        id = entry.find("{http://www.w3.org/2005/Atom}id").text
@ -250,7 +280,7 @@ def from_doi(doi):
    return None
-def to_doi(arxiv_id):
+def to_DOI(arxiv_id):
    """
    Get the associated DOI for a given arXiv eprint.
@ -262,11 +292,15 @@ def to_doi(arxiv_id):
    :param arxiv_id: The arXiv eprint id.
    :returns: The DOI if any, or ``None``.
    """
    try:
        r = requests.get("http://export.arxiv.org/api/query",
                         params={
                             "id_list": arxiv_id,
                             "max_results": 1
                         })
        r.raise_for_status()
    except RequestException:
        return None
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
        doi = entry.find("{http://arxiv.org/schemas/atom}doi")
@ -284,12 +318,12 @@ def get_sources(arxiv_id):
    :returns: A ``TarFile`` object of the sources of the arXiv preprint or \
            ``None``.
    """
    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
    try:
-        assert(r.status_code == requests.codes.ok)
+        r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
        r.raise_for_status()
        file_object = io.BytesIO(r.content)
        return tarfile.open(fileobj=file_object)
-    except (AssertionError, tarfile.TarError):
+    except (RequestException, AssertionError, tarfile.TarError):
        return None
@ -297,8 +331,8 @@ def get_bbl(arxiv_id):
    """
    Get the .bbl files (if any) of a given preprint.
-    :param eprint: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in a \
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
-            canonical form.
+            a canonical form.
    :returns: A list of the full text of the ``.bbl`` files (if any) \
            or ``None``.
    """
@ -311,6 +345,16 @@ def get_bbl(arxiv_id):
 def get_citations(arxiv_id):
    """
-    TODO
+    Get the DOIs cited by a given preprint.
    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
            a canonical form.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
-    assert(False)
+    dois = {}
    # Get the list of bbl files for this preprint
    bbl_files = get_bbl(arxiv_id)
    for bbl_file in bbl_files:
        # Fetch the cited DOIs for each of the bbl files
        dois.update(bbl.get_cited_DOIs(bbl_file))
    return dois
--- a/libbmc/repositories/hal.py
+++ b/libbmc/repositories/hal.py
@ -0,0 +1 @@
 # TODO
--- a/libbmc/tools.py
+++ b/libbmc/tools.py
@ -1,6 +1,12 @@
 """
 This file contains various utility functions.
 """
 import re
 from itertools import islice, chain
 # Huge URL regex taken from https://gist.github.com/gruber/8891611
 URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|
zm|zw)\b/?(?!@)))")
 def replaceAll(text, replace_dict):
@ -19,7 +25,8 @@ def replaceAll(text, replace_dict):
 def clean_whitespaces(text):
    """
-    Remove multiple whitespaces from text.
+    Remove multiple whitespaces from text. Also removes leading and trailing \
    whitespaces.
    :param text: Text to remove multiple whitespaces from.
    :returns: A cleaned text.
@ -35,3 +42,38 @@ def remove_duplicates(some_list):
    :returns: A list without duplicates.
    """
    return list(set(some_list))
 def batch(iterable, size):
    """
    Get items from a sequence a batch at a time.
    .. note::
        Adapted from
        https://code.activestate.com/recipes/303279-getting-items-in-batches/.
    .. note::
        Each batch is a list, so it can be iterated several times and does \
        not have to be exhausted before asking for the next one.
    :param iterable: An iterable to get batches from.
    :param size: Size of the batches.
    :returns: A generator yielding lists of at most ``size`` items. The \
            last batch may be shorter and nothing is yielded for an empty \
            iterable.
    """
    it = iter(iterable)
    while True:
        # Take up to size items, an empty list means the source is exhausted
        items = list(islice(it, size))
        if not items:
            # Stop with an explicit return rather than letting a
            # StopIteration escape from the generator.
            return
        yield items
 def remove_URLs(text):
    """
    Strip the URLs out of a given text (only http, https and naked domains \
    URLs are handled), then clean the whitespaces of the result.
    :param text: The text to remove URLs from.
    :returns: The text without URLs.
    """
    # Blank every match of the URL regex
    without_urls = URL_REGEX.sub("", text)
    # Removing URLs can leave multiple whitespaces behind
    return clean_whitespaces(without_urls)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,3 @@
-isbnlib==3.5.7
+arxiv2bib>=1.0.7
-requests==2.9.1
+isbnlib>=3.5.7
 requests>=2.9.1