From 97eb5a3ae0faa80b6d14bb3b75226a4f0aef12cb Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)" <phyks@phyks.me>
Date: Sun, 27 Dec 2015 19:35:55 +0100
Subject: [PATCH] First commit

---
 .gitignore                   |   1 +
 .gitmodules                  |   6 +
 libbmc/__init__.py           |   0
 libbmc/doi.py                | 113 +++++++++++++
 libbmc/external/opendetex    |   1 +
 libbmc/external/poppler      |   1 +
 libbmc/isbn.py               |  55 ++++++
 libbmc/repositories/arxiv.py | 316 +++++++++++++++++++++++++++++++++++
 libbmc/tools.py              |  37 ++++
 requirements.txt             |   2 +
 10 files changed, 532 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 libbmc/__init__.py
 create mode 100644 libbmc/doi.py
 create mode 160000 libbmc/external/opendetex
 create mode 160000 libbmc/external/poppler
 create mode 100644 libbmc/isbn.py
 create mode 100644 libbmc/repositories/arxiv.py
 create mode 100644 libbmc/tools.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..4603d9f
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "libbmc/external/opendetex"]
+	path = libbmc/external/opendetex
+	url = https://github.com/Phyks/opendetex
+[submodule "libbmc/external/poppler"]
+	path = libbmc/external/poppler
+	url = git://git.freedesktop.org/git/poppler/poppler
diff --git a/libbmc/__init__.py b/libbmc/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/libbmc/doi.py b/libbmc/doi.py
new file mode 100644
index 0000000..70860df
--- /dev/null
+++ b/libbmc/doi.py
@@ -0,0 +1,113 @@
+"""
+This file contains all the DOI-related functions.
+"""
+import re
+import requests
+
+from libbmc import tools
+
+# DOI-matching pattern, taken from
+# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
+regex = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
+                   re.IGNORECASE)
+# URL template of the dx.doi.org resolver; ``{doi}`` is the canonical DOI
+dx_url = "http://dx.doi.org/{doi}"
+
+
+def is_valid(doi):
+    """
+    Tell whether a string is, as a whole, a canonical DOI.
+
+    :param doi: The candidate DOI string.
+    :returns: ``True`` if the DOI pattern spans the entire string.
+    """
+    found = regex.match(doi)
+    return found is not None and found.group(0) == doi
+
+
+def extract_from_text(text):
+    """
+    Find the canonical DOIs occurring in a text, without duplicates.
+
+    :param text: The text to search for DOIs.
+    :returns: A list of the distinct DOIs found.
+    """
+    return tools.remove_duplicates(regex.findall(text))
+
+
+def to_URL(dois):
+    """
+    Build the dx.doi.org URL(s) of one or several canonical DOIs.
+
+    :param dois: A list of canonical DOIs, or a single canonical DOI.
+    :returns: A list of URLs for a list input, else a single URL.
+    """
+    if not isinstance(dois, list):
+        return dx_url.format(doi=dois)
+    # One URL per DOI, in input order
+    return [dx_url.format(doi=doi) for doi in dois]
+
+
+def to_canonical(urls):
+    """
+    Extract the canonical DOI(s) found in one or several DOI URLs.
+
+    :param urls: A list of DOI URLs, or a single DOI URL.
+    :returns: The list of DOIs found in the URL; given a list of URLs, \
+            one such list per URL.
+    """
+    if not isinstance(urls, list):
+        return extract_from_text(urls)
+    return [extract_from_text(url) for url in urls]
+
+
+def get_oa_version(doi):
+    """
+    Get an OA version for a given DOI.
+
+    .. note:: Uses beta.dissem.in API.
+
+    :param doi: A canonical DOI.
+    :returns: The URL of the OA version of the given DOI, or ``None``.
+    """
+    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
+    # Explicit checks, not ``assert``: assertions vanish under ``python -O``
+    if r.status_code != requests.codes.ok:
+        return None
+    try:
+        result = r.json()
+        if result["status"] != "ok":
+            return None
+        return result["paper"]["pdf_url"]
+    except (ValueError, KeyError):
+        return None
+
+
+def get_linked_version(doi):
+    """
+    Resolve a DOI to the URL its dx.doi.org link redirects to.
+
+    :param doi: A canonical DOI.
+    :returns: The value of the ``Location`` response header, or ``None``.
+    """
+    response = requests.head(to_URL(doi))
+    return response.headers.get("location")
+
+
+def get_bibtex(doi):
+    """
+    Fetch the BibTeX entry of a DOI through content negotiation.
+
+    .. note::
+
+        Adapted from https://gist.github.com/jrsmith3/5513926.
+
+    :param doi: The canonical DOI to get BibTeX from.
+    :returns: A BibTeX string or ``None``.
+    """
+    headers = {"accept": "application/x-bibtex"}
+    r = requests.get(to_URL(doi), headers=headers)
+    # Any other content type means no BibTeX was served
+    if r.headers.get("content-type") != "application/x-bibtex":
+        return None
+    return r.text
diff --git a/libbmc/external/opendetex b/libbmc/external/opendetex
new file mode 160000
index 0000000..b980e67
--- /dev/null
+++ b/libbmc/external/opendetex
@@ -0,0 +1 @@
+Subproject commit b980e6764279df32acd0f693a163d2040f1166b7
diff --git a/libbmc/external/poppler b/libbmc/external/poppler
new file mode 160000
index 0000000..b3425dd
--- /dev/null
+++ b/libbmc/external/poppler
@@ -0,0 +1 @@
+Subproject commit b3425dd3261679958cd56c0f71995c15d2124433
diff --git a/libbmc/isbn.py b/libbmc/isbn.py
new file mode 100644
index 0000000..ce56322
--- /dev/null
+++ b/libbmc/isbn.py
@@ -0,0 +1,55 @@
+"""
+This file contains all the ISBN-related functions.
+"""
+import isbnlib
+
+from libbmc import doi
+
+
+def is_valid(isbn):
+    """
+    Check that a given string is a valid ISBN.
+
+    :param isbn: The ISBN to be checked.
+    :returns: Boolean indicating whether the ISBN is valid or not.
+
+    """
+    return not isbnlib.notisbn(isbn)
+
+
+def extract_from_text(text):
+    """
+    Extract the ISBNs found in a text, in canonical form.
+
+    :param text: Some text.
+    :returns: A list of canonical ISBNs; unconvertible candidates are skipped.
+    """
+    isbns = [isbnlib.get_canonical_isbn(i) for i in isbnlib.get_isbnlike(text)]
+    return [isbn for isbn in isbns if isbn is not None]
+
+
+def get_bibtex(isbn):
+    """
+    Get a BibTeX string for the given ISBN, through its DOI.
+
+    :param isbn: ISBN to fetch BibTeX entry for.
+    :returns: Whatever ``doi.get_bibtex`` returns for the DOI from ``to_doi``.
+    """
+    return doi.get_bibtex(to_doi(isbn))
+
+
+def to_doi(isbn):
+    """
+    Get the DOI associated with a given ISBN, as returned by ``isbnlib.doi``.
+
+    :param isbn: A valid ISBN string.
+    :returns: A DOI as string.
+    """
+    return isbnlib.doi(isbn)
+
+
+def from_doi(doi):
+    """
+    TODO: not implemented yet (presumably: get the ISBN matching a DOI).
+    """
+    raise NotImplementedError("isbn.from_doi is not implemented yet")
diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py
new file mode 100644
index 0000000..0b6525c
--- /dev/null
+++ b/libbmc/repositories/arxiv.py
@@ -0,0 +1,316 @@
+"""
+This file contains all the arXiv-related functions.
+"""
+import io
+import re
+import requests
+import tarfile
+import xml.etree.ElementTree
+
+from libbmc import tools
+
+
+arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(?:v\d+)?"
+arxiv_identifier_before_2007 = r"(?:" + ("|".join(re.escape(c) for c in [
+    "astro-ph.GA",
+    "astro-ph.CO",
+    "astro-ph.EP",
+    "astro-ph.HE",
+    "astro-ph.IM",
+    "astro-ph.SR",
+    "cond-mat.dis-nn",
+    "cond-mat.mtrl-sci",
+    "cond-mat.mes-hall",
+    "cond-mat.other",
+    "cond-mat.quant-gas",
+    "cond-mat.soft",
+    "cond-mat.stat-mech",
+    "cond-mat.str-el",
+    "cond-mat.supr-con",
+    "gr-qc",
+    "hep-ex",
+    "hep-lat",
+    "hep-ph",
+    "hep-th",
+    "math-ph",
+    "nlin.AO",
+    "nlin.CG",
+    "nlin.CD",
+    "nlin.SI",
+    "nlin.PS",
+    "nucl-ex",
+    "nucl-th",
+    "physics.acc-ph",
+    "physics.ao-ph",
+    "physics.atom-ph",
+    "physics.atm-clus",
+    "physics.bio-ph",
+    "physics.chem-ph",
+    "physics.class-ph",
+    "physics.comp-ph",
+    "physics.data-an",
+    "physics.flu-dyn",
+    "physics.gen-ph",
+    "physics.geo-ph",
+    "physics.hist-ph",
+    "physics.ins-det",
+    "physics.med-ph",
+    "physics.optics",
+    "physics.ed-ph",
+    "physics.soc-ph",
+    "physics.plasm-ph",
+    "physics.pop-ph",
+    "physics.space-ph",
+    "physics.quant-ph",
+    "math.AG",
+    "math.AT",
+    "math.AP",
+    "math.CT",
+    "math.CA",
+    "math.CO",
+    "math.AC",
+    "math.CV",
+    "math.DG",
+    "math.DS",
+    "math.FA",
+    "math.GM",
+    "math.GN",
+    "math.GT",
+    "math.GR",
+    "math.HO",
+    "math.IT",
+    "math.KT",
+    "math.LO",
+    "math.MP",
+    "math.MG",
+    "math.NT",
+    "math.NA",
+    "math.OA",
+    "math.OC",
+    "math.PR",
+    "math.QA",
+    "math.RT",
+    "math.RA",
+    "math.SP",
+    "math.ST",
+    "math.SG",
+    "cs.AI",
+    "cs.CL",
+    "cs.CC",
+    "cs.CE",
+    "cs.CG",
+    "cs.GT",
+    "cs.CV",
+    "cs.CY",
+    "cs.CR",
+    "cs.DS",
+    "cs.DB",
+    "cs.DL",
+    "cs.DM",
+    "cs.DC",
+    "cs.ET",
+    "cs.FL",
+    "cs.GL",
+    "cs.GR",
+    "cs.AR",
+    "cs.HC",
+    "cs.IR",
+    "cs.IT",
+    "cs.LG",
+    "cs.LO",
+    "cs.MS",
+    "cs.MA",
+    "cs.MM",
+    "cs.NI",
+    "cs.NE",
+    "cs.NA",
+    "cs.OS",
+    "cs.OH",
+    "cs.PF",
+    "cs.PL",
+    "cs.RO",
+    "cs.SI",
+    "cs.SE",
+    "cs.SD",
+    "cs.SC",
+    "cs.SY",
+    "q-bio.BM",
+    "q-bio.CB",
+    "q-bio.GN",
+    "q-bio.MN",
+    "q-bio.NC",
+    "q-bio.OT",
+    "q-bio.PE",
+    "q-bio.QM",
+    "q-bio.SC",
+    "q-bio.TO",
+    "q-fin.CP",
+    "q-fin.EC",
+    "q-fin.GN",
+    "q-fin.MF",
+    "q-fin.PM",
+    "q-fin.PR",
+    "q-fin.RM",
+    "q-fin.ST",
+    "q-fin.TR",
+    "stat.AP",
+    "stat.CO",
+    "stat.ML",
+    "stat.ME",
+    "stat.OT",
+    "stat.TH"])) + r")/\d+(?:v\d+)?"
+# Groups are non-capturing so that ``findall`` yields whole identifiers
+regex = re.compile(
+    arxiv_identifier_from_2007 + "|" + arxiv_identifier_before_2007,
+    re.IGNORECASE)
+
+# URL template of an abstract page; such URLs are sometimes used as the id
+arxiv_url = "http://arxiv.org/abs/{arxiv_id}"
+# URL template of the e-print endpoint, used to download sources
+arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}"
+
+
+def is_valid(arxiv_id):
+    """
+    Tell whether a string is, as a whole, an arXiv ID.
+
+    :param arxiv_id: The candidate arXiv ID string.
+    :returns: ``True`` if the arXiv ID pattern spans the entire string.
+    """
+    found = regex.match(arxiv_id)
+    return found is not None and found.group(0) == arxiv_id
+
+
+def get_bibtex(arxiv_id):
+    """
+    TODO: not implemented yet (presumably: BibTeX entry of an arXiv eprint).
+    """
+    raise NotImplementedError("arxiv.get_bibtex is not implemented yet")
+
+
+def extract_from_text(text):
+    """
+    Extract arXiv IDs from a text.
+
+    :param text: The text to extract arXiv IDs from.
+    :returns: A list of the distinct arXiv IDs found, as plain strings.
+    """
+    return tools.remove_duplicates([m.group(0) for m in regex.finditer(text)])
+
+
+def to_URL(arxiv_ids):
+    """
+    Build the arXiv abstract URL(s) of one or several arXiv IDs.
+
+    :param arxiv_ids: A list of arXiv IDs, or a single arXiv ID.
+    :returns: A list of URLs for a list input, else a single URL.
+    """
+    if not isinstance(arxiv_ids, list):
+        return arxiv_url.format(arxiv_id=arxiv_ids)
+    # One URL per arXiv ID, in input order
+    return [arxiv_url.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
+
+
+def to_canonical(urls):
+    """
+    Extract the arXiv ID(s) found in one or several arXiv URLs.
+
+    :param urls: A list of arXiv URLs, or a single arXiv URL.
+    :returns: What ``extract_from_text`` gives for the URL; given a list \
+            of URLs, one such result per URL.
+    """
+    if not isinstance(urls, list):
+        return extract_from_text(urls)
+    return [extract_from_text(url) for url in urls]
+
+
+def from_doi(doi):
+    """
+    Get the arXiv eprint id for a given DOI.
+
+    .. note::
+
+        Uses arXiv API. Will not return anything if arXiv is not aware of the
+        associated DOI.
+
+    :param doi: The DOI of the resource to look for.
+    :returns: The arXiv eprint id, or ``None`` if not found.
+    """
+    r = requests.get("http://export.arxiv.org/api/query",
+                     params={
+                         "search_query": "doi:%s" % (doi,),
+                         "max_results": 1
+                     })
+    feed = xml.etree.ElementTree.fromstring(r.content)
+    entry = next(feed.iter("{http://www.w3.org/2005/Atom}entry"), None)
+    if entry is None:
+        return None
+    # Keep all that follows "/abs/" so old ids (hep-th/9901001) stay whole
+    entry_url = entry.find("{http://www.w3.org/2005/Atom}id").text
+    return entry_url.split("/abs/")[-1]
+
+
+def to_doi(arxiv_id):
+    """
+    Look up the DOI that arXiv associates with an eprint.
+
+    .. note::
+
+        Uses arXiv API. Will not return anything if arXiv is not aware of the
+        associated DOI.
+
+    :param arxiv_id: The arXiv eprint id.
+    :returns: The DOI if any, or ``None``.
+    """
+    params = {"id_list": arxiv_id, "max_results": 1}
+    r = requests.get("http://export.arxiv.org/api/query", params=params)
+    feed = xml.etree.ElementTree.fromstring(r.content)
+    # Look, entry by entry, for the arXiv-namespaced ``doi`` element; the
+    # first one found gives the answer.
+    dois = (entry.find("{http://arxiv.org/schemas/atom}doi")
+            for entry in feed.iter("{http://www.w3.org/2005/Atom}entry"))
+    for doi in dois:
+        if doi is not None:
+            return doi.text
+    return None
+
+
+def get_sources(arxiv_id):
+    """
+    Download sources on arXiv for a given preprint.
+
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) \
+            in a canonical form.
+    :returns: A ``TarFile`` object of the sources of the arXiv preprint, or \
+            ``None`` on a non-OK response or an unreadable archive.
+    """
+    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
+    if r.status_code != requests.codes.ok:
+        return None
+    try:
+        return tarfile.open(fileobj=io.BytesIO(r.content))
+    except tarfile.TarError:
+        return None
+
+
+def get_bbl(arxiv_id):
+    """
+    Get the .bbl files (if any) of a given preprint.
+
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) \
+            in a canonical form.
+    :returns: A list of the full text of the ``.bbl`` files (if any), \
+            or ``None`` if the sources could not be fetched.
+    """
+    tf = get_sources(arxiv_id)
+    if tf is None:
+        return None
+    return [tf.extractfile(member).read().decode(tarfile.ENCODING)
+            for member in tf.getmembers() if member.name.endswith(".bbl")]
+
+
+def get_citations(arxiv_id):
+    """
+    TODO: not implemented yet (presumably: citations of an arXiv eprint).
+    """
+    raise NotImplementedError("arxiv.get_citations is not implemented yet")
diff --git a/libbmc/tools.py b/libbmc/tools.py
new file mode 100644
index 0000000..0088d3d
--- /dev/null
+++ b/libbmc/tools.py
@@ -0,0 +1,37 @@
+"""
+This file contains various utility functions.
+"""
+
+
+def replaceAll(text, replace_dict):
+    """
+    Apply a set of substring substitutions to a text, one after another.
+
+    :param text: The text to perform the substitutions on.
+    :param replace_dict: Dictionary whose keys are the substrings to look \
+            for and whose values are their replacements.
+    :returns: The text once every substitution has been applied.
+    """
+    for old, new in replace_dict.items():
+        text = text.replace(old, new)
+    return text
+
+
+def clean_whitespaces(text):
+    """
+    Collapse every run of whitespace into one space and trim both ends.
+
+    :param text: The text to normalize.
+    :returns: The text with single spaces between its words.
+    """
+    return " ".join(text.split())
+
+
+def remove_duplicates(some_list):
+    """
+    Remove the duplicates from a list, keeping first occurrences in order.
+
+    :param some_list: List (of hashable items) to remove duplicates from.
+    :returns: A list without duplicates (ordered on Python 3.7+ dicts).
+    """
+    return list(dict.fromkeys(some_list))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c721aa3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+isbnlib==3.5.7
+requests==2.9.1