Add a citation fetcher for plaintext, and factorize code with bbl citations fetcher

2015-12-28 00:21:41 +01:00 · 2015-12-28 00:21:41 +01:00 · 0d17254f6c
commit 0d17254f6c
parent bd0016cb51
2 changed files with 108 additions and 52 deletions
--- a/libbmc/citations/bbl.py
+++ b/libbmc/citations/bbl.py
@ -1,16 +1,13 @@
 """
-This files contains all the functions to deal with .bbl files.
+This files contains all the functions to extract DOIs of citations from .bbl
 files.
 """
 import os
 import re
 import requests
 import subprocess
 from requests.exception import RequestException
 from libbmc import doi
 from libbmc import tools
-from libbmc.repositories import arxiv
+from libbmc.citations import bbl
 # Regex to match bibitems
@ -19,11 +16,6 @@ BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
 ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
 # CrossRef API URL
 CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
 CROSSREF_MAX_BATCH_SIZE = 10
 def bibitem_as_plaintext(bibitem):
    """
    Return a plaintext representation of the bibitem from the ``.bbl`` file.
@ -75,51 +67,16 @@ def get_plaintext_citations(bbl):
    return cleaned_bbl
-def get_cited_DOIs(bbl):
+def get_cited_DOIs(bbl_input):
    """
    Get the DOIs of the papers cited in this .bbl file.
-    :param bbl: Either the path to a .bbl file or the content of a .bbl file.
+    :param bbl_input: Either the path to a .bbl file or the content \
            of a .bbl file.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    dois = {}
    crossref_queue = []
    # Get the plaintext citations from the bbl file
-    plaintext_citations = get_plaintext_citations(bbl)
+    plaintext_citations = get_plaintext_citations(bbl_input)
-    # Try to get the DOI directly from the citation
+    # Use the plaintext citations parser on these citations
-    for citation in plaintext_citations[:]:
+    return bbl.get_cited_DOIs(plaintext_citations)
        # Some citations already contain a DOI so try to match it directly
        matched_DOIs = doi.extract_from_text(citation)
        if matched_DOIs is not None:
            # Add the DOI and go on
            dois[citation] = matched_DOIs[0]
            continue
        # Same thing for arXiv id
        matched_arXiv = arxiv.extract_from_text(citation)
        if matched_arXiv is not None:
            # Add the associated DOI and go on
            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
            continue
        # If no match found, stack it for next step
        # Note to remove URLs in the citation as the plaintext citations can
        # contain URLs and they are bad for the CrossRef API.
        crossref_queue.append(tools.remove_URLs(citation))
    # Do batch of papers, to prevent from the timeout of crossref
    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
        try:
            # Fetch results from CrossRef
            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
            for result in r.json()["results"]:
                # Try to get a DOI
                try:
                    dois[result["text"]] = result["doi"]
                except KeyError:
                    # Or set it to None
                    dois[result["text"]] = None
        except (RequestException, ValueError, KeyError):
            # If an exception occurred, set all the DOIs to None for the
            # current batch
            for i in batch:
                dois[i] = None
    return dois
--- a/libbmc/citations/plaintext.py
+++ b/libbmc/citations/plaintext.py
@ -0,0 +1,99 @@
 """
 This files contains all the functions to extract DOIs of citations from
 plaintext files.
 """
 import os
 import requests
 from requests.exception import RequestException
 from libbmc import doi
 from libbmc import tools
 from libbmc.repositories import arxiv
 # CrossRef API URL
 CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
 CROSSREF_MAX_BATCH_SIZE = 10
 def get_plaintext_citations(file):
    """
    Parse a plaintext file to get a clean list of plaintext citations. The \
            file should have one citation per line.
    :param file: Either the path to the plaintext file or the content of a \
            plaintext file.
    :returns:  A list of cleaned plaintext citations.
    """
    # Handle path or content
    if os.path.isfile(file):
        with open(file, 'r') as fh:
            content = fh.readlines()
    else:
        content = file.splitlines()
    # Clean every line to have plaintext
    cleaned_citations = [tools.clean_whitespaces(line) for line in content]
    return cleaned_citations
 def get_cited_DOIs(file):
    """
    Get the DOIs of the papers cited in a plaintext file. The file should \
            have one citation per line.
    .. note::
        This function is also used as a backend tool by most of the others \
        citations processors, to factorize the code.
    :param file: Either the path to the plaintext file or the content of a \
            plaintext file. It can also be a parsed list of plaintext \
            citations and, in this case, no preprocessing is done.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    # If file is not a pre-processed list of plaintext citations
    if not isinstance(file, list):
        # It is either a path to a plaintext file or the content of a plaintext
        # file, we need some pre-processing to get a list of citations.
        plaintext_citations = get_plaintext_citations(file)
    dois = {}
    crossref_queue = []
    # Try to get the DOI directly from the citation
    for citation in plaintext_citations[:]:
        # Some citations already contain a DOI so try to match it directly
        matched_DOIs = doi.extract_from_text(citation)
        if matched_DOIs is not None:
            # Add the DOI and go on
            dois[citation] = matched_DOIs[0]
            continue
        # Same thing for arXiv id
        matched_arXiv = arxiv.extract_from_text(citation)
        if matched_arXiv is not None:
            # Add the associated DOI and go on
            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
            continue
        # If no match found, stack it for next step
        # Note to remove URLs in the citation as the plaintext citations can
        # contain URLs and they are bad for the CrossRef API.
        crossref_queue.append(tools.remove_URLs(citation))
    # Do batch with remaining papers, to prevent from the timeout of CrossRef
    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
        try:
            # Fetch results from CrossRef
            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
            for result in r.json()["results"]:
                # Try to get a DOI
                try:
                    dois[result["text"]] = result["doi"]
                except KeyError:
                    # Or set it to None
                    dois[result["text"]] = None
        except (RequestException, ValueError, KeyError):
            # If an exception occurred, set all the DOIs to None for the
            # current batch
            for i in batch:
                dois[i] = None
    return dois