From 985304446cf7da26b2dee8a43029f2964554df2e Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Wed, 23 Dec 2015 22:18:52 +0100
Subject: [PATCH] Plug arXiv

---
 README.md |   3 +-
 main.py   | 174 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 161 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 9fc9865..e44cdff 100644
--- a/README.md
+++ b/README.md
@@ -13,4 +13,5 @@ For building `opendetex` (which is a necessary dependency), you will need
 
 ## Usage
 
-`./main.py some_file.bbl` to get a list of DOIs associated to each `\bibitem`.
+* `./main.py some_file.bbl` to get a list of DOIs associated with each `\bibitem`.
+* `./main.py arxiv_eprint_id` to get a list of DOIs associated with each reference in the provided arXiv eprint.
diff --git a/main.py b/main.py
index 94914f4..1e2613a 100755
--- a/main.py
+++ b/main.py
@@ -1,10 +1,24 @@
 #!/usr/bin/env python3
+import io
 import math
 import os
 import re
 import requests
 import subprocess
 import sys
+import tarfile
+
+
+regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+regex_bibitems = re.compile(r"\\bibitem\{.+?\}")
+regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
+
+
+def clean_whitespaces(text):
+    """
+    Collapse consecutive whitespace and strip trailing . and , from text.
+    """
+    return ' '.join(text.strip().rstrip(".,").split())
 
 
 def oa_from_doi(doi):
     """
@@ -18,34 +32,103 @@
 def clean_bibitem(bibitem):
     """
     Return a plaintext representation of the bibitem from the bbl file.
+
+    Params:
+        - bibitem is the text content of the bibitem.
+
+    Returns a cleaned plaintext citation from the bibitem.
     """
     script_dir = os.path.dirname(os.path.abspath(__file__))
-    output = subprocess.check_output([script_dir + "/opendetex/delatex", "-s"],
+    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
+                                      "-s"],
                                      input=bibitem.encode("utf-8"))
     output = output.decode("utf-8")
-    output = ' '.join(output.strip().rstrip(".,").split())
+    output = clean_whitespaces(output)
     return output
 
 
-def parse_bbl(bbl_file):
-    with open(bbl_file, 'r') as fh:
-        bbl_content = fh.read()
-    bibitems = re.split(r"\\bibitem\{.+?\}", bbl_content)[1:]
-    bibitems = [re.sub(r"\\end\{thebibliography}",
-                       "",
-                       i).strip() for i in bibitems]
+def parse_bbl(bbl):
+    """
+    Parse a *.bbl file to get a clean list of plaintext citations.
+
+    Params:
+        - bbl is either the path to the .bbl file or the content of a .bbl file.
+
+    Returns a list of cleaned plaintext citations.
+    """
+    # Handle either a path or the content itself
+    if os.path.isfile(bbl):
+        with open(bbl, 'r') as fh:
+            bbl_content = fh.read()
+    else:
+        bbl_content = bbl
+    # Get a list of bibitems
+    bibitems = regex_bibitems.split(bbl_content)[1:]
+    bibitems = [regex_endthebibliography.sub("",
+                                             i).strip() for i in bibitems]
     cleaned_bbl = []
+    # Clean every bibitem
     for bibitem in bibitems:
         cleaned_bbl.append(clean_bibitem(bibitem))
     return cleaned_bbl
 
 
-def dois_from_bbl(bbl_file):
+def extract_doi_links(urls):
+    """
+    Try to find a DOI from a given list of URLs.
+    """
+    doi_urls = [url for url in urls if "/doi/" in url]
+    if len(doi_urls) > 0:
+        # Slicing at find("/doi/") + 4 keeps the trailing "/<DOI>" part
+        return ("http://dx.doi.org" +
+                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
+    else:
+        return None
+
+
+def extract_arxiv_links(urls):
+    """
+    Try to find an arXiv link from a given list of URLs.
+    """
+    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
+    if len(arxiv_urls) > 0:
+        return arxiv_urls[0]
+    else:
+        return None
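For illustration, here is a minimal sketch of how the two helpers above classify the URLs found in a citation. The citation text and URLs are invented for this example; the regex is the `regex_urls` defined at the top of main.py:

    import re

    regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

    # Invented citation: one arXiv link and one publisher link containing /doi/
    citation = ("J. Doe, Some title, Some Journal (2014), "
                "http://arxiv.org/abs/1401.2910 "
                "http://journals.example.org/doi/10.1103/PhysRevX.4.000000")
    urls = [u.lower() for u in regex_urls.findall(citation)]
    print(urls)  # both URLs, lowercased
    # extract_arxiv_links(urls) -> "http://arxiv.org/abs/1401.2910"
    # extract_doi_links(urls)   -> "http://dx.doi.org/10.1103/physrevx.4.000000"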
+ """ + arxiv_urls = [url for url in urls if "://arxiv.org" in url] + if len(arxiv_urls) > 0: + return arxiv_urls[0] + else: + return None + + +def dois_from_bbl(bbl): """ Get the papers cited by the paper identified by the given DOI. + + Params: + - bbl is either the path to the .bbl file or the content of a bbl file. + + Returns a dict of cleaned plaintext citations and their associated doi. """ - cleaned_citations = parse_bbl(bbl_file) + cleaned_citations_with_URLs = parse_bbl(bbl) dois = {} + cleaned_citations = [] + # Try to get the DOI directly from the citation + for citation in cleaned_citations_with_URLs[:]: + # Get all the urls in the citation + raw_urls = regex_urls.findall(citation) + urls = [u.lower() for u in raw_urls] + # Remove URLs in citation + for url in raw_urls: + citation = citation.replace(url, "") + citation = clean_whitespaces(citation) + # Try to find an arXiv link + arxiv_url = extract_arxiv_links(urls) + if arxiv_url: + dois[citation] = arxiv_url + # Try to find a DOI link + doi_url = extract_doi_links(urls) + if doi_url: + dois[citation] = doi_url + # If no match found, stack it for next step + if citation not in dois: + cleaned_citations.append(citation) + # Do batch of 10 papers, to prevent from the timeout of crossref for i in range(math.ceil(len(cleaned_citations) / 10)): lower_bound = 10 * i upper_bound = min(10 * (i + 1), len(cleaned_citations)) @@ -53,15 +136,76 @@ def dois_from_bbl(bbl_file): json=cleaned_citations[lower_bound:upper_bound]) for result in r.json()["results"]: if "doi" not in result: + # If DOI is not found, try a direct query to get a DOI + # r = requests.get("http://search.crossref.org/dois", + # params={ + # 'q': result["text"], + # "sort": "score", + # "rows": 1 + # }) + # doi_result = r.json() + # if len(doi_result) > 0: + # dois[result["text"]] = doi_result[0]["doi"] + # else: + # dois[result["text"]] = None dois[result["text"]] = None else: dois[result["text"]] = result["doi"] return dois -if __name__ == "__main__": - if len(sys.argv) < 2: - sys.exit("Usage: " + sys.argv[0] + " BBL_FILE.") +def sources_from_arxiv(eprint): + """ + Download sources on arXiv for a given preprint. + Params: + - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). + + Returns a TarFile object of the sources of the arXiv preprint. + """ + r = requests.get("http://arxiv.org/e-print/%s" % (eprint,)) + file_object = io.BytesIO(r.content) + return tarfile.open(fileobj=file_object) + + +def bbl_from_arxiv(eprint): + """ + Get the .bbl files (if any) of a given preprint. + + Params: + - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). + + Returns a list of the .bbl files as text (if any) or None. + """ + tf = sources_from_arxiv(eprint) + bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")] + bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING) + for member in bbl_files] + return bbl_files + + +def dois_from_arxiv(eprint): + """ + Get the .bbl files (if any) of a given preprint. + + Params: + - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). + + Returns a dict of cleaned plaintext citations and their associated doi. 
+ """ + bbl_files = bbl_from_arxiv(eprint) + dois = {} + for bbl in bbl_files: + dois.update(dois_from_bbl(bbl)) + return dois + + +if __name__ == "__main__": import pprint - pprint.pprint(dois_from_bbl(sys.argv[1])) + if len(sys.argv) < 2: + sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.") + + if os.path.isfile(sys.argv[1]): + pprint.pprint(dois_from_bbl(sys.argv[1])) + else: + pprint.pprint(dois_from_arxiv(sys.argv[1]))