Reimport bbl citations parsing and make some minor fixes

parent 97eb5a3ae0
commit d8b74ae356

libbmc/citations/bbl.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+"""
+This file contains all the functions to deal with .bbl files.
+"""
+import os
+import re
+import requests
+import subprocess
+
+from requests.exceptions import RequestException
+
+from libbmc import doi
+from libbmc import tools
+from libbmc.repositories import arxiv
+
+
+# Regex to match bibitems
+BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
+# Regex to match end of bibliography
+ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography\}.*")
+
+
+# CrossRef API URL
+CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
+CROSSREF_MAX_BATCH_SIZE = 10
+
+
+def bibitem_as_plaintext(bibitem):
+    """
+    Return a plaintext representation of the bibitem from the ``.bbl`` file.
+
+    .. note::
+
+        This plaintext representation can be super ugly, contain URLs and \
+                so on.
+
+    :param bibitem: The text content of the bibitem.
+    :returns: A cleaned plaintext citation from the bibitem.
+    """
+    try:
+        output = subprocess.check_output(["delatex", "-s"],
+                                         input=bibitem.encode("utf-8"))
+    except FileNotFoundError:
+        # Fall back on the bundled opendetex build
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        output = subprocess.check_output(
+            ["%s/../external/opendetex/delatex" % (script_dir,), "-s"],
+            input=bibitem.encode("utf-8"))
+    output = output.decode("utf-8")
+    output = tools.clean_whitespaces(output)
+    return output
+
+
+def get_plaintext_citations(bbl):
+    """
+    Parse a ``*.bbl`` file to get a clean list of plaintext citations.
+
+    :param bbl: Either the path to the .bbl file or the content of a \
+            ``.bbl`` file.
+    :returns: A list of cleaned plaintext citations.
+    """
+    # Handle path or content
+    if os.path.isfile(bbl):
+        with open(bbl, 'r') as fh:
+            bbl_content = fh.read()
+    else:
+        bbl_content = bbl
+    # Get a list of bibitems, taking the first item out as it is *before*
+    # the first \bibitem
+    bibitems = BIBITEMS_REGEX.split(bbl_content)[1:]
+    # Delete the text after the \end{thebibliography}
+    bibitems = [ENDTHEBIBLIOGRAPHY_REGEX.sub("", i).strip() for i in bibitems]
+    # Clean every bibitem to get plaintext
+    cleaned_bbl = [bibitem_as_plaintext(bibitem) for bibitem in bibitems]
+    return cleaned_bbl
+
+
+def get_cited_DOIs(bbl):
+    """
+    Get the DOIs of the papers cited in this .bbl file.
+
+    :param bbl: Either the path to a .bbl file or the content of a .bbl file.
+
+    :returns: A dict of cleaned plaintext citations and their associated DOI.
+    """
+    dois = {}
+    crossref_queue = []
+    # Get the plaintext citations from the bbl file
+    plaintext_citations = get_plaintext_citations(bbl)
+    # Try to get the DOI directly from the citation
+    for citation in plaintext_citations[:]:
+        # Some citations already contain a DOI, so try to match it directly.
+        # extract_from_text returns a (possibly empty) list of matches.
+        matched_DOIs = doi.extract_from_text(citation)
+        if matched_DOIs:
+            # Add the DOI and go on
+            dois[citation] = matched_DOIs[0]
+            continue
+        # Same thing for arXiv ids
+        matched_arXiv = arxiv.extract_from_text(citation)
+        if matched_arXiv:
+            # Add the associated DOI and go on
+            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
+            continue
+        # If no match is found, stack the citation for the next step.
+        # Remove URLs in the citation, as the plaintext citations can
+        # contain URLs, which are bad for the CrossRef API.
+        crossref_queue.append(tools.remove_URLs(citation))
+    # Query CrossRef in batches of papers, to prevent a timeout
+    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
+        # Materialize the batch: tools.batch yields lazy iterators, which
+        # could neither be serialized to JSON nor iterated twice below.
+        batch = list(batch)
+        try:
+            # Fetch results from CrossRef
+            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
+            for result in r.json()["results"]:
+                # Try to get a DOI
+                try:
+                    dois[result["text"]] = result["doi"]
+                except KeyError:
+                    # Or set it to None
+                    dois[result["text"]] = None
+        except (RequestException, ValueError, KeyError):
+            # If an exception occurred, set all the DOIs to None for the
+            # current batch
+            for i in batch:
+                dois[i] = None
+    return dois
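Review note: a minimal sketch of the intended call flow for the new module, assuming a local ``refs.bbl`` file (the filename is hypothetical) and a ``delatex`` binary on the PATH:

    from libbmc.citations import bbl

    # Either a path to a .bbl file or its raw content is accepted
    citations = bbl.get_plaintext_citations("refs.bbl")
    print(citations)  # list of cleaned plaintext citations

    # Full pipeline: plaintext extraction, then DOI matching via CrossRef
    dois = bbl.get_cited_DOIs("refs.bbl")
    for citation, matched_doi in dois.items():
        print(citation, "->", matched_doi)  # DOI string, or None if unmatched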
libbmc/doi.py
@@ -4,14 +4,16 @@ This file contains all the DOI-related functions.
 import re
 import requests
 
+from requests.exceptions import RequestException
+
 from libbmc import tools
 
 # Taken from
 # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
-regex = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
+REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
                    re.IGNORECASE)
 # Base dx.doi.org URL for redirections
-dx_url = "http://dx.doi.org/{doi}"
+DX_URL = "http://dx.doi.org/{doi}"
 
 
 def is_valid(doi):
@@ -21,7 +23,7 @@ def is_valid(doi):
     :param doi: The DOI to be checked.
     :returns: Boolean indicating whether the DOI is valid or not.
     """
-    match = regex.match(doi)
+    match = REGEX.match(doi)
     return ((match is not None) and (match.group(0) == doi))
 
 
@@ -32,7 +34,7 @@ def extract_from_text(text):
     :param text: The text to extract DOIs from.
     :returns: A list of found DOIs.
     """
-    return tools.remove_duplicates(regex.findall(text))
+    return tools.remove_duplicates(REGEX.findall(text))
 
 
 def to_URL(dois):
@@ -43,9 +45,9 @@ def to_URL(dois):
     :returns: A list of DOI URLs.
     """
     if isinstance(dois, list):
-        return [dx_url.format(doi=doi) for doi in dois]
+        return [DX_URL.format(doi=doi) for doi in dois]
     else:
-        return dx_url.format(doi=dois)
+        return DX_URL.format(doi=dois)
 
 
 def to_canonical(urls):
@@ -73,13 +75,13 @@ def get_oa_version(doi):
     :returns: The URL of the OA version of the given DOI, or ``None``.
     """
     # If DOI is a link, truncate it
-    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
     try:
-        assert(r.status_code == requests.codes.ok)
+        r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
+        r.raise_for_status()
         result = r.json()
         assert(result["status"] == "ok")
         return result["paper"]["pdf_url"]
-    except (AssertionError, ValueError, KeyError):
+    except (AssertionError, ValueError, KeyError, RequestException):
         return None
 
 
@@ -90,8 +92,11 @@ def get_linked_version(doi):
     :param doi: A canonical DOI.
     :returns: The canonical URL behind the DOI, or ``None``.
     """
-    r = requests.head(to_URL(doi))
-    return r.headers.get("location")
+    try:
+        r = requests.head(to_URL(doi))
+        return r.headers.get("location")
+    except RequestException:
+        return None
 
 
 def get_bibtex(doi):
@@ -105,9 +110,11 @@ def get_bibtex(doi):
     :param doi: The canonical DOI to get BibTeX from.
     :returns: A BibTeX string or ``None``.
     """
-    r = requests.get(to_URL(doi),
-                     headers={"accept": "application/x-bibtex"})
-    if r.headers.get("content-type") == "application/x-bibtex":
+    try:
+        r = requests.get(to_URL(doi),
+                         headers={"accept": "application/x-bibtex"})
+        r.raise_for_status()
+        assert(r.headers.get("content-type") == "application/x-bibtex")
         return r.text
-    else:
+    except (RequestException, AssertionError):
         return None
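Review note: the reworked ``get_bibtex`` relies on DOI content negotiation: dx.doi.org redirects to the publisher, and an ``Accept: application/x-bibtex`` header requests a BibTeX rendering instead of HTML. A standalone sketch of the same mechanism, independent of libbmc (the DOI below is only an example):

    import requests

    example_doi = "10.1103/PhysRevLett.116.061102"  # example DOI
    r = requests.get("http://dx.doi.org/" + example_doi,
                     headers={"accept": "application/x-bibtex"})
    if r.ok and r.headers.get("content-type") == "application/x-bibtex":
        print(r.text)  # the BibTeX entry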
libbmc/isbn.py
@@ -35,10 +35,10 @@ def get_bibtex(isbn):
     :param isbn: ISBN to fetch BibTeX entry for.
     :returns: A BibTeX string.
     """
-    return doi.get_bibtex(to_doi(isbn))
+    return doi.get_bibtex(to_DOI(isbn))
 
 
-def to_doi(isbn):
+def to_DOI(isbn):
     """
     Try to fetch a DOI from a given ISBN.
 
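Review note: with the ``to_doi`` → ``to_DOI`` rename, the ISBN helpers still chain into the DOI module; a short sketch, assuming an ISBN that actually resolves to a DOI (the ISBN is only an example):

    from libbmc import isbn

    matched_doi = isbn.to_DOI("9780521574624")  # example ISBN; lookup may fail
    if matched_doi is not None:
        print(isbn.get_bibtex("9780521574624"))  # BibTeX via DOI negotiation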
libbmc/repositories/arxiv.py
@@ -1,17 +1,23 @@
 """
 This file contains all the arXiv-related functions.
 """
+import arxiv2bib
 import io
 import re
 import requests
 import tarfile
 import xml.etree.ElementTree
 
+from urllib.error import HTTPError
+from requests.exceptions import RequestException
+
 
 from libbmc import tools
+from libbmc.citations import bbl
 
 
-arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
-arxiv_identifier_before_2007 = r"(" + ("|".join([
+ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
+ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
     "astro-ph.GA",
     "astro-ph.CO",
     "astro-ph.EP",
@@ -159,15 +165,15 @@ arxiv_identifier_before_2007 = r"(" + ("|".join([
     "stat.ME",
     "stat.OT",
     "stat.TH"])) + r")/\d+"
-regex = re.compile(
-    "(" + arxiv_identifier_from_2007 + ")|(" +
-    arxiv_identifier_before_2007 + ")",
+REGEX = re.compile(
+    "(" + ARXIV_IDENTIFIER_FROM_2007 + ")|(" +
+    ARXIV_IDENTIFIER_BEFORE_2007 + ")",
     re.IGNORECASE)
 
 # Base arXiv URL used as id sometimes
-arxiv_url = "http://arxiv.org/abs/{arxiv_id}"
+ARXIV_URL = "http://arxiv.org/abs/{arxiv_id}"
 # Eprint URL used to download sources
-arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}"
+ARXIV_EPRINT_URL = "http://arxiv.org/e-print/{arxiv_id}"
 
 
 def is_valid(arxiv_id):
@@ -177,15 +183,35 @@ def is_valid(arxiv_id):
     :param arxiv_id: The arXiv ID to be checked.
     :returns: Boolean indicating whether the arXiv ID is valid or not.
     """
-    match = regex.match(arxiv_id)
+    match = REGEX.match(arxiv_id)
     return ((match is not None) and (match.group(0) == arxiv_id))
 
 
 def get_bibtex(arxiv_id):
     """
-    TODO
+    Get a BibTeX entry for a given arXiv ID.
+
+    .. note::
+
+        Uses the awesome https://pypi.python.org/pypi/arxiv2bib/ module.
+
+    :param arxiv_id: The canonical arXiv id to get BibTeX from.
+    :returns: A BibTeX string or ``None``.
     """
-    assert(False)
+    # Fetch BibTeX using the arxiv2bib module
+    try:
+        bibtex = arxiv2bib.arxiv2bib([arxiv_id])
+    except HTTPError:
+        bibtex = []
+
+    for bib in bibtex:
+        if isinstance(bib, arxiv2bib.ReferenceErrorInfo):
+            continue
+        else:
+            # Return fetched BibTeX
+            return bib.bibtex()
+    # An error occurred, return None
+    return None
 
 
 def extract_from_text(text):
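Review note: for reviewers unfamiliar with the module, ``arxiv2bib.arxiv2bib`` takes a list of ids and returns one reference object per id, with ``ReferenceErrorInfo`` standing in for failed lookups; a minimal sketch (the arXiv id is only an example):

    import arxiv2bib

    refs = arxiv2bib.arxiv2bib(["1401.2910"])  # example arXiv id
    for ref in refs:
        if not isinstance(ref, arxiv2bib.ReferenceErrorInfo):
            print(ref.bibtex())  # BibTeX string for the preprint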
@@ -195,7 +221,7 @@ def extract_from_text(text):
     :param text: The text to extract arXiv IDs from.
     :returns: A list of matching arXiv IDs.
     """
-    return tools.remove_duplicates(regex.findall(text))
+    return tools.remove_duplicates(REGEX.findall(text))
 
 
 def to_URL(arxiv_ids):
@@ -206,9 +232,9 @@ def to_URL(arxiv_ids):
     :returns: A list of arXiv URLs.
     """
     if isinstance(arxiv_ids, list):
-        return [arxiv_url.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
+        return [ARXIV_URL.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
     else:
-        return arxiv_url.format(arxiv_id=arxiv_ids)
+        return ARXIV_URL.format(arxiv_id=arxiv_ids)
 
 
 def to_canonical(urls):
@@ -236,11 +262,15 @@ def from_doi(doi):
     :param doi: The DOI of the resource to look for.
     :returns: The arXiv eprint id, or ``None`` if not found.
     """
-    r = requests.get("http://export.arxiv.org/api/query",
-                     params={
-                         "search_query": "doi:%s" % (doi,),
-                         "max_results": 1
-                     })
+    try:
+        r = requests.get("http://export.arxiv.org/api/query",
+                         params={
+                             "search_query": "doi:%s" % (doi,),
+                             "max_results": 1
+                         })
+        r.raise_for_status()
+    except RequestException:
+        return None
     e = xml.etree.ElementTree.fromstring(r.content)
     for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
         id = entry.find("{http://www.w3.org/2005/Atom}id").text
@@ -250,7 +280,7 @@ def from_doi(doi):
     return None
 
 
-def to_doi(arxiv_id):
+def to_DOI(arxiv_id):
     """
     Get the associated DOI for a given arXiv eprint.
 
@@ -262,11 +292,15 @@ def to_doi(arxiv_id):
     :param arxiv_id: The arXiv eprint id.
     :returns: The DOI if any, or ``None``.
     """
-    r = requests.get("http://export.arxiv.org/api/query",
-                     params={
-                         "id_list": arxiv_id,
-                         "max_results": 1
-                     })
+    try:
+        r = requests.get("http://export.arxiv.org/api/query",
+                         params={
+                             "id_list": arxiv_id,
+                             "max_results": 1
+                         })
+        r.raise_for_status()
+    except RequestException:
+        return None
     e = xml.etree.ElementTree.fromstring(r.content)
     for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
         doi = entry.find("{http://arxiv.org/schemas/atom}doi")
@@ -284,12 +318,12 @@ def get_sources(arxiv_id):
     :returns: A ``TarFile`` object of the sources of the arXiv preprint or \
             ``None``.
     """
-    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
     try:
-        assert(r.status_code == requests.codes.ok)
+        r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
+        r.raise_for_status()
         file_object = io.BytesIO(r.content)
         return tarfile.open(fileobj=file_object)
-    except (AssertionError, tarfile.TarError):
+    except (RequestException, AssertionError, tarfile.TarError):
         return None
 
 
@@ -297,8 +331,8 @@ def get_bbl(arxiv_id):
     """
     Get the .bbl files (if any) of a given preprint.
 
-    :param eprint: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in a \
-            canonical form.
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
+            a canonical form.
     :returns: A list of the full text of the ``.bbl`` files (if any) \
             or ``None``.
     """
@@ -311,6 +345,16 @@ def get_bbl(arxiv_id):
 
 def get_citations(arxiv_id):
     """
-    TODO
+    Get the DOIs cited by a given preprint.
+
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
+            a canonical form.
+    :returns: A dict of cleaned plaintext citations and their associated DOI.
     """
-    assert(False)
+    dois = {}
+    # Get the list of bbl files for this preprint
+    bbl_files = get_bbl(arxiv_id)
+    for bbl_file in bbl_files:
+        # Fetch the cited DOIs for each of the bbl files
+        dois.update(bbl.get_cited_DOIs(bbl_file))
+    return dois
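Review note: end to end, the new plumbing goes arXiv id → e-print tarball → ``.bbl`` files → DOIs; a minimal sketch, assuming the preprint ships a ``.bbl`` in its sources (the id is only an example):

    from libbmc.repositories import arxiv

    cited = arxiv.get_citations("1401.2910")  # example arXiv id
    for citation, matched_doi in cited.items():
        print(citation, "->", matched_doi)

One caveat: ``get_bbl`` may return ``None`` when no sources are available, in which case ``get_citations`` as written would raise on the ``for`` loop; guarding against that could be a follow-up.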
libbmc/repositories/hal.py (new file, 1 line)
@@ -0,0 +1 @@
+# TODO
libbmc/tools.py
@@ -1,6 +1,12 @@
 """
 This file contains various utility functions.
 """
+import re
+
 from itertools import islice, chain
 
 
+# Huge URL regex taken from https://gist.github.com/gruber/8891611
+URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))")
+
+
 def replaceAll(text, replace_dict):
@@ -19,7 +25,8 @@ def replaceAll(text, replace_dict):
 
 def clean_whitespaces(text):
     """
-    Remove multiple whitespaces from text.
+    Remove multiple whitespaces from text. Also removes leading and trailing \
+            whitespaces.
 
     :param text: Text to remove multiple whitespaces from.
     :returns: A cleaned text.
@@ -35,3 +42,38 @@ def remove_duplicates(some_list):
     :returns: A list without duplicates.
     """
     return list(set(some_list))
+
+
+def batch(iterable, size):
+    """
+    Get items from a sequence a batch at a time.
+
+    .. note::
+
+        Adapted from
+        https://code.activestate.com/recipes/303279-getting-items-in-batches/.
+
+    .. note::
+
+        All batches must be exhausted immediately.
+
+    :param iterable: An iterable to get batches from.
+    :param size: Size of the batches.
+    :returns: A new batch of the given size at each time.
+    """
+    it = iter(iterable)
+    while True:
+        bi = islice(it, size)
+        try:
+            # next() raises StopIteration once the source is exhausted;
+            # catch it so the generator terminates cleanly.
+            yield chain([next(bi)], bi)
+        except StopIteration:
+            return
+
+
+def remove_URLs(text):
+    """
+    Remove URLs from a given text (only removes http, https and naked \
+            domain URLs).
+
+    :param text: The text to remove URLs from.
+    :returns: The text without URLs.
+    """
+    return clean_whitespaces(URL_REGEX.sub("", text))
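Review note: because ``batch`` yields lazy ``islice``/``chain`` views over a single shared iterator, each batch must be consumed before the next one is requested; a small usage sketch:

    from libbmc import tools

    for b in tools.batch(range(7), 3):
        print(list(b))  # consume each batch fully before continuing
    # prints [0, 1, 2], then [3, 4, 5], then [6]

This is also why ``get_cited_DOIs`` materializes each batch with ``list(batch)`` before POSTing it to CrossRef.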
requirements.txt
@@ -1,2 +1,3 @@
-isbnlib==3.5.7
-requests==2.9.1
+arxiv2bib>=1.0.7
+isbnlib>=3.5.7
+requests>=2.9.1