From 0d17254f6cd393fce7e79effcbc91a4df10ef045 Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)" <phyks@phyks.me>
Date: Mon, 28 Dec 2015 00:21:41 +0100
Subject: [PATCH] Add a citation fetcher for plaintext, and factorize code with
 bbl citations fetcher

---
 libbmc/citations/bbl.py       | 61 ++++-----------------
 libbmc/citations/plaintext.py | 99 +++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 52 deletions(-)
 create mode 100644 libbmc/citations/plaintext.py

diff --git a/libbmc/citations/bbl.py b/libbmc/citations/bbl.py
index 10aef3e..8de79ab 100644
--- a/libbmc/citations/bbl.py
+++ b/libbmc/citations/bbl.py
@@ -1,16 +1,13 @@
 """
-This files contains all the functions to deal with .bbl files.
+This file contains all the functions to extract DOIs of citations from .bbl
+files.
 """
 import os
 import re
-import requests
 import subprocess
 
-from requests.exception import RequestException
-
-from libbmc import doi
 from libbmc import tools
-from libbmc.repositories import arxiv
+from libbmc.citations import plaintext
 
 
 # Regex to match bibitems
@@ -19,11 +16,6 @@ BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
 ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
 
 
-# CrossRef API URL
-CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
-CROSSREF_MAX_BATCH_SIZE = 10
-
-
 def bibitem_as_plaintext(bibitem):
     """
     Return a plaintext representation of the bibitem from the ``.bbl`` file.
@@ -75,51 +67,16 @@ def get_plaintext_citations(bbl):
     return cleaned_bbl
 
 
-def get_cited_DOIs(bbl):
+def get_cited_DOIs(bbl_input):
     """
     Get the DOIs of the papers cited in this .bbl file.
 
-    :param bbl: Either the path to a .bbl file or the content of a .bbl file.
+    :param bbl_input: Either the path to a .bbl file or the content \
+            of a .bbl file.
 
     :returns: A dict of cleaned plaintext citations and their associated DOI.
     """
-    dois = {}
-    crossref_queue = []
     # Get the plaintext citations from the bbl file
-    plaintext_citations = get_plaintext_citations(bbl)
-    # Try to get the DOI directly from the citation
-    for citation in plaintext_citations[:]:
-        # Some citations already contain a DOI so try to match it directly
-        matched_DOIs = doi.extract_from_text(citation)
-        if matched_DOIs is not None:
-            # Add the DOI and go on
-            dois[citation] = matched_DOIs[0]
-            continue
-        # Same thing for arXiv id
-        matched_arXiv = arxiv.extract_from_text(citation)
-        if matched_arXiv is not None:
-            # Add the associated DOI and go on
-            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
-            continue
-        # If no match found, stack it for next step
-        # Note to remove URLs in the citation as the plaintext citations can
-        # contain URLs and they are bad for the CrossRef API.
-        crossref_queue.append(tools.remove_URLs(citation))
-    # Do batch of papers, to prevent from the timeout of crossref
-    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
-        try:
-            # Fetch results from CrossRef
-            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
-            for result in r.json()["results"]:
-                # Try to get a DOI
-                try:
-                    dois[result["text"]] = result["doi"]
-                except KeyError:
-                    # Or set it to None
-                    dois[result["text"]] = None
-        except (RequestException, ValueError, KeyError):
-            # If an exception occurred, set all the DOIs to None for the
-            # current batch
-            for i in batch:
-                dois[i] = None
-    return dois
+    plaintext_citations = get_plaintext_citations(bbl_input)
+    # Use the plaintext citations parser on these citations
+    return plaintext.get_cited_DOIs(plaintext_citations)
diff --git a/libbmc/citations/plaintext.py b/libbmc/citations/plaintext.py
new file mode 100644
index 0000000..58ea18c
--- /dev/null
+++ b/libbmc/citations/plaintext.py
@@ -0,0 +1,99 @@
+"""
+This file contains all the functions to extract DOIs of citations from
+plaintext files.
+"""
+import os
+import requests
+
+from requests.exceptions import RequestException
+
+from libbmc import doi
+from libbmc import tools
+from libbmc.repositories import arxiv
+
+
+# CrossRef links API URL, and size of the citation batches posted to it
+CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
+CROSSREF_MAX_BATCH_SIZE = 10
+
+
+def get_plaintext_citations(file):
+    """
+    Build a clean list of plaintext citations out of a plaintext file, which \
+            is expected to hold one citation per line.
+
+    :param file: Either the path to the plaintext file or the content of a \
+            plaintext file.
+    :returns: A list of cleaned plaintext citations, one per input line.
+    """
+    if not os.path.isfile(file):
+        # Not an existing file, so treat the argument as the content itself
+        raw_lines = file.splitlines()
+    else:
+        # An existing path: read the citations from the file
+        with open(file, 'r') as fh:
+            raw_lines = fh.readlines()
+    # Normalize the whitespaces of every citation
+    return [tools.clean_whitespaces(entry) for entry in raw_lines]
+
+
+def get_cited_DOIs(file):
+    """
+    Get the DOIs of the papers cited in a plaintext file. The file should \
+            have one citation per line.
+
+    .. note::
+
+        This function is also used as a backend tool by most of the other \
+        citation processors, to factorize the code.
+
+    :param file: Either the path to the plaintext file or the content of a \
+            plaintext file. It can also be a parsed list of plaintext \
+            citations and, in this case, no preprocessing is done.
+    :returns: A dict of cleaned plaintext citations and their associated DOI.
+    """
+    if isinstance(file, list):
+        # Already a pre-processed list of plaintext citations, use it as is
+        plaintext_citations = file
+    else:
+        # Path to a plaintext file or its content: pre-process it into a list
+        plaintext_citations = get_plaintext_citations(file)
+    dois = {}
+    crossref_queue = []
+
+    # Try to get the DOI directly from the citation
+    for citation in plaintext_citations[:]:
+        # Some citations already contain a DOI so try to match it directly
+        matched_DOIs = doi.extract_from_text(citation)
+        if matched_DOIs is not None:
+            # Add the DOI and go on
+            dois[citation] = matched_DOIs[0]
+            continue
+        # Same thing for arXiv id
+        matched_arXiv = arxiv.extract_from_text(citation)
+        if matched_arXiv is not None:
+            # Add the associated DOI and go on
+            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
+            continue
+        # No match found: stack it for the next step, without its URLs as
+        # they are bad for the CrossRef API.
+        crossref_queue.append(tools.remove_URLs(citation))
+
+    # Do batch with remaining papers, to prevent from the timeout of CrossRef
+    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
+        try:
+            # Fetch results from CrossRef
+            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
+            for result in r.json()["results"]:
+                # Try to get a DOI
+                try:
+                    dois[result["text"]] = result["doi"]
+                except KeyError:
+                    # Or set it to None
+                    dois[result["text"]] = None
+        except (RequestException, ValueError, KeyError):
+            # If an exception occurred, set all the DOIs to None for the
+            # current batch
+            for i in batch:
+                dois[i] = None
+    return dois