From 51df30918a92b6a59a35c8ab2767094a2e81e817 Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Wed, 23 Dec 2015 22:49:14 +0100
Subject: [PATCH] Clean a bit the code

---
 .gitignore |   1 +
 arxiv.py   |  50 ++++++++++
 bbl.py     | 124 ++++++++++++++++++++++++
 doi.py     |  77 +++++++++++++++
 main.py    | 276 +----------------------------------------------------
 regex.py   |  14 +++
 tools.py   |  12 +++
 7 files changed, 283 insertions(+), 271 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 arxiv.py
 create mode 100644 bbl.py
 create mode 100644 doi.py
 create mode 100644 regex.py
 create mode 100644 tools.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/arxiv.py b/arxiv.py
new file mode 100644
index 0000000..f9dc958
--- /dev/null
+++ b/arxiv.py
@@ -0,0 +1,50 @@
+import bbl
+import io
+import requests
+import tarfile
+
+
+def sources_from_arxiv(eprint):
+    """
+    Download the sources of a given arXiv preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a TarFile object of the sources of the arXiv preprint.
+    """
+    r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
+    file_object = io.BytesIO(r.content)
+    return tarfile.open(fileobj=file_object)
+
+
+def bbl_from_arxiv(eprint):
+    """
+    Get the .bbl files (if any) of a given preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a list of the .bbl files as text (empty if there are none).
+    """
+    tf = sources_from_arxiv(eprint)
+    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
+    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
+                 for member in bbl_files]
+    return bbl_files
+
+
+def get_dois(eprint):
+    """
+    Get the DOIs of the papers cited by a given preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a dict of cleaned plaintext citations and their associated doi.
+    """
+    bbl_files = bbl_from_arxiv(eprint)
+    dois = {}
+    for bbl_file in bbl_files:
+        dois.update(bbl.get_dois(bbl_file))
+    return dois
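
For orientation, here is a minimal usage sketch of the new arxiv module. It is not part of the patch; the eprint id is illustrative, and the requests dependency plus network access are assumed.

# Usage sketch for the new arxiv module (illustrative, not part of the patch).
import arxiv

# List the .bbl bibliography files shipped in the preprint's sources.
bbl_files = arxiv.bbl_from_arxiv("1401.2910")
print("%d .bbl file(s) found" % len(bbl_files))

# Map each cleaned plaintext citation to its DOI/arXiv URL (None if unresolved).
for citation, url in arxiv.get_dois("1401.2910").items():
    print(citation, "->", url)
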
diff --git a/bbl.py b/bbl.py
new file mode 100644
index 0000000..97ace12
--- /dev/null
+++ b/bbl.py
@@ -0,0 +1,124 @@
+import doi
+import math
+import os
+import requests
+import subprocess
+
+import regex
+import tools
+
+
+def clean_bibitem(bibitem):
+    """
+    Return a plaintext representation of the bibitem from the bbl file.
+
+    Params:
+        - bibitem is the text content of the bibitem.
+
+    Returns a cleaned plaintext citation from the bibitem.
+    """
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
+                                      "-s"],
+                                     input=bibitem.encode("utf-8"))
+    output = output.decode("utf-8")
+    output = tools.clean_whitespaces(output)
+    return output
+
+
+def parse(bbl):
+    """
+    Parse a *.bbl file to get a clean list of plaintext citations.
+
+    Params:
+        - bbl is either the path to the .bbl file or the content of a bbl
+          file.
+
+    Returns a list of cleaned plaintext citations.
+    """
+    # Handle path or content
+    if os.path.isfile(bbl):
+        with open(bbl, 'r') as fh:
+            bbl_content = fh.read()
+    else:
+        bbl_content = bbl
+    # Get a list of bibitems
+    bibitems = regex.bibitems.split(bbl_content)[1:]
+    bibitems = [regex.endthebibliography.sub("",
+                                             i).strip() for i in bibitems]
+    cleaned_bbl = []
+    # Clean every bibitem
+    for bibitem in bibitems:
+        cleaned_bbl.append(clean_bibitem(bibitem))
+    return cleaned_bbl
+
+
+def get_dois(bbl_input):
+    """
+    Get the DOIs of the papers cited in the given .bbl file.
+
+    Params:
+        - bbl_input is either the path to the .bbl file or the content of a
+          bbl file.
+
+    Returns a dict of cleaned plaintext citations and their associated doi.
+    """
+    cleaned_citations_with_URLs = parse(bbl_input)
+    dois = {}
+    cleaned_citations = []
+    # Try to get the DOI directly from the citation
+    for citation in cleaned_citations_with_URLs[:]:
+        # Get all the URLs in the citation
+        raw_urls = regex.urls.findall(citation)
+        urls = [u.lower() for u in raw_urls]
+        # Remove URLs from the citation
+        for url in raw_urls:
+            citation = citation.replace(url, "")
+        citation = tools.clean_whitespaces(citation)
+        # Try to find an arXiv link
+        arxiv_url = doi.extract_arxiv_links(urls)
+        if arxiv_url:
+            dois[citation] = arxiv_url
+        # Try to find a DOI link
+        doi_url = doi.extract_doi_links(urls)
+        if doi_url:
+            dois[citation] = doi_url
+        # Try to find a direct match using a regex if the URL search failed
+        if not doi_url and not arxiv_url:
+            regex_match = doi.match_doi_or_arxiv(citation)
+            if regex_match:
+                print(regex_match)
+                citation = citation.replace(regex_match[1], "")
+                if regex_match[0] == "DOI":
+                    dois[citation] = "http://dx.doi.org/%s" % (regex_match[1],)
+                else:
+                    dois[citation] = (
+                        "http://arxiv.org/abs/%s" %
+                        (regex_match[1].replace("arxiv:", ""),)
+                    )
+        # If no match was found, queue the citation for the CrossRef step
+        if citation not in dois:
+            cleaned_citations.append(citation)
+    # Query CrossRef in batches of 10 papers to avoid hitting its timeout
+    for i in range(math.ceil(len(cleaned_citations) / 10)):
+        lower_bound = 10 * i
+        upper_bound = min(10 * (i + 1), len(cleaned_citations))
+        r = requests.post("http://search.crossref.org/links",
+                          json=cleaned_citations[lower_bound:upper_bound])
+        for result in r.json()["results"]:
+            if "doi" not in result:
+                # If no DOI was found, try a direct query to get one
+                # r = requests.get("http://search.crossref.org/dois",
+                #                  params={
+                #                      'q': result["text"],
+                #                      "sort": "score",
+                #                      "rows": 1
+                #                  })
+                # doi_result = r.json()
+                # if len(doi_result) > 0:
+                #     dois[result["text"]] = doi_result[0]["doi"]
+                # else:
+                #     dois[result["text"]] = None
+                dois[result["text"]] = None
+            else:
+                dois[result["text"]] = result["doi"]
+    return dois
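
The batched POST at the end of get_dois is the fallback path for citations that carried no explicit identifier. Below is a condensed sketch of that pattern, assuming the same search.crossref.org endpoint as above; resolve_batch and batch_size are names introduced here only for illustration.

# Condensed sketch of the batched CrossRef lookup used in get_dois above.
import math
import requests

def resolve_batch(citations, batch_size=10):
    dois = {}
    for i in range(math.ceil(len(citations) / batch_size)):
        batch = citations[batch_size * i:batch_size * (i + 1)]
        r = requests.post("http://search.crossref.org/links", json=batch)
        for result in r.json()["results"]:
            # "doi" is absent from a result when no match was found.
            dois[result["text"]] = result.get("doi")
    return dois
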
diff --git a/doi.py b/doi.py
new file mode 100644
index 0000000..76c69a6
--- /dev/null
+++ b/doi.py
@@ -0,0 +1,77 @@
+import regex
+import tools
+
+
+def extract_doi_links(urls):
+    """
+    Try to find a DOI in a given list of URLs.
+    """
+    doi_urls = [url for url in urls if "/doi/" in url]
+    if len(doi_urls) > 0:
+        return ("http://dx.doi.org" +
+                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
+    else:
+        return None
+
+
+def extract_arxiv_links(urls):
+    """
+    Try to find an arXiv link in a given list of URLs.
+    """
+    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
+    if len(arxiv_urls) > 0:
+        return arxiv_urls[0]
+    else:
+        return None
+
+
+def match_doi_or_arxiv(text, only=["DOI", "arXiv"]):
+    """
+    Search for a valid article ID (DOI or arXiv) in the given text
+    (regex-based).
+
+    Returns a tuple (type, first matching ID) or None if not found.
+    From: http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
+    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
+    """
+    text = text.lower()
+    # Try to extract a DOI
+    if "DOI" in only:
+        extractID = regex.doi.search(text.replace('Œ', '-'))
+        if not extractID:
+            # PNAS fix
+            extractID = regex.doi_pnas.search(text.
+                                              replace('pnas', '/pnas'))
+            if not extractID:
+                # JCB fix
+                extractID = regex.doi_jsb.search(text)
+        if extractID:
+            # If a DOI was extracted, clean it and return it
+            cleanDOI = False
+            cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
+            if regex.clean_doi.search(cleanDOI):
+                cleanDOI = cleanDOI[1:]
+            # FASEB J fix
+            if regex.clean_doi_fabse.search(cleanDOI):
+                cleanDOI = cleanDOI[:20]
+            # Second JCB fix
+            if regex.clean_doi_jcb.search(cleanDOI):
+                cleanDOI = cleanDOI[:21]
+            if len(cleanDOI) > 40:
+                cleanDOItemp = regex.clean_doi_len.sub('000', cleanDOI)
+                reps = {'.': 'A', '-': '0'}
+                cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
+                digitStart = 0
+                for i in range(len(cleanDOItemp)):
+                    if cleanDOItemp[i].isdigit():
+                        digitStart = 1
+                    if cleanDOItemp[i].isalpha() and digitStart:
+                        break
+                cleanDOI = cleanDOI[0:(8+i)]
+            return ("DOI", cleanDOI)
+    # Else, try to extract an arXiv id
+    if "arXiv" in only:
+        extractID = regex.arXiv.search(text)
+        if extractID:
+            return ("arXiv", extractID.group(1))
+    return None
diff --git a/main.py b/main.py
index ec08aac..f2fd229 100755
--- a/main.py
+++ b/main.py
@@ -1,40 +1,10 @@
 #!/usr/bin/env python3
-import io
-import math
 import os
-import re
-import requests
-import subprocess
 import sys
-import tarfile
-
-regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
-regex_bibitems = re.compile(r"\\bibitem\{.+?\}")
-regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
-
-regex_doi = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
-regex_doi_pnas = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
-regex_doi_jsb = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)
-regex_clean_doi = re.compile('^/')
-regex_clean_doi_fabse = re.compile('^10.1096')
-regex_clean_doi_jcb = re.compile('^10.1083')
-regex_clean_doi_len = re.compile(r'\d\.\d')
-regex_arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
-
-
-def replaceAll(text, dic):
-    """Replace all the dic keys by the associated item in text"""
-    for i, j in dic.items():
-        text = text.replace(i, j)
-    return text
-
-
-def clean_whitespaces(text):
-    """
-    Remove double whitespaces and trailing . and , from text.
-    """
-    return ' '.join(text.strip().rstrip(".,").split())
+# Local import
+import arxiv
+import bbl
 
 
 def oa_from_doi(doi):
@@ -45,248 +15,12 @@ def oa_from_doi(doi):
     pass
 
 
-def clean_bibitem(bibitem):
-    """
-    Return a plaintext representation of the bibitem from the bbl file.
-
-    Params:
-        - bibitem is the text content of the bibitem.
-
-    Returns a cleaned plaintext citation from the bibitem.
-    """
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
-                                      "-s"],
-                                     input=bibitem.encode("utf-8"))
-    output = output.decode("utf-8")
-    output = clean_whitespaces(output)
-    return output
-
-
-def parse_bbl(bbl):
-    """
-    Parse a *.bbl file to get a clean list of plaintext citations.
- - Params: - - bbl is either the path to the .bbl file or the content of a bbl file. - - Returns a list of cleaned plaintext citations. - """ - # Handle path or content - if os.path.isfile(bbl): - with open(bbl, 'r') as fh: - bbl_content = fh.read() - else: - bbl_content = bbl - # Get a list of bibitems - bibitems = regex_bibitems.split(bbl_content)[1:] - bibitems = [regex_endthebibliography.sub("", - i).strip() for i in bibitems] - cleaned_bbl = [] - # Clean every bibitem - for bibitem in bibitems: - cleaned_bbl.append(clean_bibitem(bibitem)) - return cleaned_bbl - - -def extract_doi_links(urls): - """ - Try to find a DOI from a given list of URLs. - """ - doi_urls = [url for url in urls if "/doi/" in url] - if len(doi_urls) > 0: - return ("http://dx.doi.org" + - doi_urls[0][doi_urls[0].find("/doi/") + 4:]) - else: - return None - - -def extract_arxiv_links(urls): - """ - Try to find an arXiv link from a given list of URLs. - """ - arxiv_urls = [url for url in urls if "://arxiv.org" in url] - if len(arxiv_urls) > 0: - return arxiv_urls[0] - else: - return None - - -def match_doi_or_arxiv(text, only=["DOI", "arXiv"]): - """ - Search for a valid article ID (DOI or ArXiv) in the given text - (regex-based). - - Returns a tuple (type, first matching ID) or None if not found. - From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/ - and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb - """ - text = text.lower() - # Try to extract DOI - if "DOI" in only: - extractID = regex_doi.search(text.replace('Œ', '-')) - if not extractID: - # PNAS fix - extractID = regex_doi_pnas.search(text. - replace('pnas', '/pnas')) - if not extractID: - # JSB fix - extractID = regex_doi_jsb.search(text) - if extractID: - # If DOI extracted, clean it and return it - cleanDOI = False - cleanDOI = extractID.group(0).replace(':', '').replace(' ', '') - if regex_clean_doi.search(cleanDOI): - cleanDOI = cleanDOI[1:] - # FABSE J fix - if regex_clean_doi_fabse.search(cleanDOI): - cleanDOI = cleanDOI[:20] - # Second JCB fix - if regex_clean_doi_jcb.search(cleanDOI): - cleanDOI = cleanDOI[:21] - if len(cleanDOI) > 40: - cleanDOItemp = regex_clean_doi_len.sub('000', cleanDOI) - reps = {'.': 'A', '-': '0'} - cleanDOItemp = replaceAll(cleanDOItemp[8:], reps) - digitStart = 0 - for i in range(len(cleanDOItemp)): - if cleanDOItemp[i].isdigit(): - digitStart = 1 - if cleanDOItemp[i].isalpha() and digitStart: - break - cleanDOI = cleanDOI[0:(8+i)] - return ("DOI", cleanDOI) - # Else, try to extract arXiv - if "arXiv" in only: - extractID = regex_arXiv.search(text) - if extractID: - return ("arXiv", extractID.group(1)) - return None - - -def dois_from_bbl(bbl): - """ - Get the papers cited by the paper identified by the given DOI. - - Params: - - bbl is either the path to the .bbl file or the content of a bbl file. - - Returns a dict of cleaned plaintext citations and their associated doi. 
- """ - cleaned_citations_with_URLs = parse_bbl(bbl) - dois = {} - cleaned_citations = [] - # Try to get the DOI directly from the citation - for citation in cleaned_citations_with_URLs[:]: - # Get all the urls in the citation - raw_urls = regex_urls.findall(citation) - urls = [u.lower() for u in raw_urls] - # Remove URLs in citation - for url in raw_urls: - citation = citation.replace(url, "") - citation = clean_whitespaces(citation) - # Try to find an arXiv link - arxiv_url = extract_arxiv_links(urls) - if arxiv_url: - dois[citation] = arxiv_url - # Try to find a DOI link - doi_url = extract_doi_links(urls) - if doi_url: - dois[citation] = doi_url - # Try to find a direct match using a regex if links search failed - if not doi_url and not arxiv_url: - regex_match = match_doi_or_arxiv(citation) - if regex_match: - print(regex_match) - citation = citation.replace(regex_match[1], "") - if regex_match[0] == "DOI": - dois[citation] = "http://dx.doi.org/%s" % (regex_match[1],) - else: - dois[citation] = ( - "http://arxiv.org/abs/%s" % - (regex_match[1].replace("arxiv:", ""),) - ) - # If no match found, stack it for next step - if citation not in dois: - cleaned_citations.append(citation) - # Do batch of 10 papers, to prevent from the timeout of crossref - for i in range(math.ceil(len(cleaned_citations) / 10)): - lower_bound = 10 * i - upper_bound = min(10 * (i + 1), len(cleaned_citations)) - r = requests.post("http://search.crossref.org/links", - json=cleaned_citations[lower_bound:upper_bound]) - for result in r.json()["results"]: - if "doi" not in result: - # If DOI is not found, try a direct query to get a DOI - # r = requests.get("http://search.crossref.org/dois", - # params={ - # 'q': result["text"], - # "sort": "score", - # "rows": 1 - # }) - # doi_result = r.json() - # if len(doi_result) > 0: - # dois[result["text"]] = doi_result[0]["doi"] - # else: - # dois[result["text"]] = None - dois[result["text"]] = None - else: - dois[result["text"]] = result["doi"] - return dois - - -def sources_from_arxiv(eprint): - """ - Download sources on arXiv for a given preprint. - - Params: - - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). - - Returns a TarFile object of the sources of the arXiv preprint. - """ - r = requests.get("http://arxiv.org/e-print/%s" % (eprint,)) - file_object = io.BytesIO(r.content) - return tarfile.open(fileobj=file_object) - - -def bbl_from_arxiv(eprint): - """ - Get the .bbl files (if any) of a given preprint. - - Params: - - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). - - Returns a list of the .bbl files as text (if any) or None. - """ - tf = sources_from_arxiv(eprint) - bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")] - bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING) - for member in bbl_files] - return bbl_files - - -def dois_from_arxiv(eprint): - """ - Get the .bbl files (if any) of a given preprint. - - Params: - - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). - - Returns a dict of cleaned plaintext citations and their associated doi. 
-    """
-    bbl_files = bbl_from_arxiv(eprint)
-    dois = {}
-    for bbl in bbl_files:
-        dois.update(dois_from_bbl(bbl))
-    return dois
-
-
 if __name__ == "__main__":
     import pprint
     if len(sys.argv) < 2:
         sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.")
     if os.path.isfile(sys.argv[1]):
-        pprint.pprint(dois_from_bbl(sys.argv[1]))
+        pprint.pprint(bbl.get_dois(sys.argv[1]))
     else:
-        pprint.pprint(dois_from_arxiv(sys.argv[1]))
+        pprint.pprint(arxiv.get_dois(sys.argv[1]))
diff --git a/regex.py b/regex.py
new file mode 100644
index 0000000..940f2de
--- /dev/null
+++ b/regex.py
@@ -0,0 +1,14 @@
+import re
+
+urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+bibitems = re.compile(r"\\bibitem\{.+?\}")
+endthebibliography = re.compile(r"\\end\{thebibliography}")
+
+doi = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
+doi_pnas = re.compile(r'(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
+doi_jsb = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
+clean_doi = re.compile('^/')
+clean_doi_fabse = re.compile('^10.1096')
+clean_doi_jcb = re.compile('^10.1083')
+clean_doi_len = re.compile(r'\d\.\d')
+arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
diff --git a/tools.py b/tools.py
new file mode 100644
index 0000000..c793256
--- /dev/null
+++ b/tools.py
@@ -0,0 +1,12 @@
+def replaceAll(text, dic):
+    """Replace each key of dic found in text with its associated value."""
+    for i, j in dic.items():
+        text = text.replace(i, j)
+    return text
+
+
+def clean_whitespaces(text):
+    """
+    Collapse runs of whitespace and strip trailing . and , from text.
+    """
+    return ' '.join(text.strip().rstrip(".,").split())
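
As a quick sanity check of the split modules, the regex-based extraction in doi.py can be exercised directly; the sample citation strings below are illustrative.

# Quick check of the regex-based identifier extraction (sample strings are
# illustrative; match_doi_or_arxiv returns a (type, id) tuple or None).
import doi

print(doi.match_doi_or_arxiv("J. Doe et al., doi:10.1073/pnas.0123456789"))
# -> ("DOI", "10.1073/pnas.0123456789")
print(doi.match_doi_or_arxiv("J. Doe et al., arXiv: 1401.2910"))
# -> ("arXiv", "1401.2910")
print(doi.match_doi_or_arxiv("no identifier in this citation"))
# -> None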