From c785e04589f4c9182fb62a9ae541689a97435185 Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)" <phyks@phyks.me>
Date: Mon, 1 Feb 2016 17:32:24 +0100
Subject: [PATCH] Add a __valid_identifiers__ list to ease fetching of
 identifiers in papers

See the detailed explanations in README.md.

Also fixed some typos in docstrings.
---
 README.md                    | 15 +++++++++
 libbmc/__init__.py           | 14 ++++++---
 libbmc/doi.py                |  4 +++
 libbmc/isbn.py               |  5 +++
 libbmc/papers/identifiers.py | 61 +++++++++++++++++++++++-------------
 libbmc/papers/tearpages.py   |  2 +-
 libbmc/repositories/arxiv.py | 14 ++++++---
 7 files changed, 85 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 66b6ae5..8dcb003 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,21 @@ install the matching software (`CERMINE`, `Grobid` or `pdf-extract`). See the
 docstrings of those functions for more infos on this particular point.
 
 
+## Note on `__valid_identifiers__`
+
+`libbmc` exposes a `__valid_identifiers__` list, containing the valid
+identifier types. These are the modules exposing the same functions as the
+`doi` and `isbn` modules, in particular the extraction from a string and the
+BibTeX fetching functions.
+
+If you write additional modules for other repositories, you can include them
+in the `__valid_identifiers__` list, as long as they provide these functions.
+
+This list is especially useful for the `libbmc.papers.identifiers` module,
+which uses it to loop through all the available identifier types, to search
+for them in the paper and retrieve the matching BibTeX.
+
+
 ## License
 
 This code is licensed under an MIT license.
diff --git a/libbmc/__init__.py b/libbmc/__init__.py
index 926387a..1946de6 100644
--- a/libbmc/__init__.py
+++ b/libbmc/__init__.py
@@ -1,9 +1,15 @@
-from . import bibtex, doi, fetcher, isbn
-from . import citations, papers, repositories
+# Global list of valid paper identifier types. See README.md.
+__valid_identifiers__ = []
 
-__version__ = "0.1"
+# Import order of the modules is important, as they will populate
+# `__valid_identifiers__` on load, and the order in this list reflects their
+# priority.
+from . import bibtex, doi, fetcher, isbn  # noqa
+from . import citations, papers, repositories  # noqa
+
+__version__ = "0.1.1"
 
 __all__ = [
     "bibtex", "doi", "fetcher", "isbn",
-    "citations", "papers", "repositories"
+    "citations", "papers", "repositories",
 ]
diff --git a/libbmc/doi.py b/libbmc/doi.py
index bfc5da8..3fee128 100644
--- a/libbmc/doi.py
+++ b/libbmc/doi.py
@@ -6,8 +6,12 @@ import requests
 
 from requests.exceptions import RequestException
 
+from libbmc import __valid_identifiers__
 from libbmc import tools
 
+# Append DOI to the valid identifiers list
+__valid_identifiers__ += ["doi"]
+
 # Taken from
 # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
 REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b",
diff --git a/libbmc/isbn.py b/libbmc/isbn.py
index 5c5904a..fe0e1c4 100644
--- a/libbmc/isbn.py
+++ b/libbmc/isbn.py
@@ -5,6 +5,11 @@ import isbnlib
 
 from libbmc import doi
 
+from libbmc import __valid_identifiers__
+
+# Append ISBN to the valid identifiers list
+__valid_identifiers__ += ["isbn"]
+
 
 def is_valid(isbn):
     """
diff --git a/libbmc/papers/identifiers.py b/libbmc/papers/identifiers.py
index 32e340f..abca898 100644
--- a/libbmc/papers/identifiers.py
+++ b/libbmc/papers/identifiers.py
@@ -6,10 +6,15 @@ Needs pdftotext and/or djvutxt installed on the machine.
 
 TODO: Unittests
 """
+import importlib
 import subprocess
+import sys
 
-from libbmc import doi, isbn
-from libbmc.repositories import arxiv, hal
+from libbmc import __valid_identifiers__
+
+# Import all the modules associated with __valid_identifiers__
+for type in __valid_identifiers__:
+    importlib.import_module("libbmc.%s" % (type,))
 
 
 def find_identifiers(src):
@@ -30,7 +35,7 @@ def find_identifiers(src):
 
     :params src: Path to the file to scan.
 
-    :returns: a tuple (type, identifier) or ``None`` if not found or \
+    :returns: a tuple (type, identifier) or ``(None, None)`` if not found or \
             an error occurred.
     """
     if src.endswith(".pdf"):
@@ -44,29 +49,43 @@ def find_identifiers(src):
                                   stderr=subprocess.PIPE,
                                   bufsize=1)
     else:
-        return None
+        return (None, None)
 
     while totext.poll() is None:
         extract_full = ' '.join([i.decode("utf-8").strip()
                                 for i in totext.stdout.readlines()])
-        found_isbn = isbn.extract_from_text(extract_full)
-        if isbn:
-            totext.terminate()
-            return ("isbn", found_isbn)
+        # Loop over all the valid identifier types
+        for type in __valid_identifiers__:
+            # Dynamically call the ``extract_from_text`` method for the
+            # associated module.
+            m = sys.modules.get("libbmc.%s" % (type,), None)
+            if m is None:
+                continue
+            found_id = getattr(m, "extract_from_text")(extract_full)
+            if found_id:
+                totext.terminate()
+                return (type, found_id[0])  # found_id is a list of found IDs
+    return (None, None)
 
-        found_doi = doi.extract_from_text(extract_full)
-        if doi:
-            totext.terminate()
-            return ("doi", found_doi)
 
-        found_arxiv = arxiv.extract_from_text(extract_full)
-        if arxiv:
-            totext.terminate()
-            return ("arxiv", found_arxiv)
+def get_bibtex(identifier):
+    """
+    Try to fetch BibTeX from a found identifier.
 
-        found_hal = hal.extract_from_text(extract_full)
-        if hal:
-            totext.terminate()
-            return ("hal", found_hal)
+    .. note::
 
-    return None
+        Calls the functions in the respective identifiers module.
+
+    :param identifier: a tuple (type, identifier) with a valid type.
+    :returns: A BibTeX string or ``None`` if an error occurred.
+    # TODO: Should return a BibTeX object?
+    """
+    type, id = identifier
+    if type not in __valid_identifiers__:
+        return None
+
+    # Dynamically call the ``get_bibtex`` method from the associated module.
+    m = sys.modules.get("libbmc.%s" % (type,), None)
+    if m is None:
+        return None
+    return getattr(m, "get_bibtex")(id)
diff --git a/libbmc/papers/tearpages.py b/libbmc/papers/tearpages.py
index 24a2f9f..a58af58 100644
--- a/libbmc/papers/tearpages.py
+++ b/libbmc/papers/tearpages.py
@@ -100,7 +100,7 @@ def tearpage_needed(bibtex):
 
 def tearpage(filename, bibtex=None, force=False):
     """
-    Tear the some pages of the file if needed.
+    Tear some pages of the file if needed.
 
     :params filename: Path to the file to handle.
     :params bibtex: BibTeX dict associated to this file, as the one given by \
diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py
index bea7f1f..335f480 100644
--- a/libbmc/repositories/arxiv.py
+++ b/libbmc/repositories/arxiv.py
@@ -13,9 +13,13 @@ from urllib.error import HTTPError
 from requests.exceptions import RequestException
 
 
+from libbmc import __valid_identifiers__
 from libbmc import tools
 from libbmc.citations import bbl
 
+# Append arXiv to the valid identifiers list
+__valid_identifiers__ += ["repositories.arxiv"]
+
 
 ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
 ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
@@ -305,12 +309,14 @@ def extract_from_text(text):
     Extract arXiv IDs from a text.
 
     :param text: The text to extract arXiv IDs from.
-    :returns: A list of matching arXiv IDs.
+    :returns: A list of matching arXiv IDs, in canonical form.
 
     >>> sorted(extract_from_text('1506.06690 1506.06690v1 arXiv:1506.06690 arXiv:1506.06690v1 arxiv:1506.06690 arxiv:1506.06690v1 math.GT/0309136 abcdf bar1506.06690foo mare.GG/0309136'))
-    ['1506.06690', '1506.06690v1', 'arXiv:1506.06690', 'arXiv:1506.06690v1', 'arxiv:1506.06690', 'arxiv:1506.06690v1', 'math.GT/0309136']
+    ['1506.06690', '1506.06690v1', 'math.GT/0309136']
     """
-    return tools.remove_duplicates([i[0]
+    # Remove the leading "arxiv:".
+    return tools.remove_duplicates([re.sub("arxiv:", "", i[0],
+                                           flags=re.IGNORECASE)
                                     for i in REGEX.findall(text) if i[0] != ''])
 
 
@@ -335,7 +341,7 @@ def to_URL(arxiv_ids):
 
 def to_canonical(urls):
     """
-    Convert a list of DOIs URLs to a list of canonical DOIs.
+    Convert a list of arXiv IDs to a list of canonical IDs.
 
     :param dois: A list of DOIs URLs.
     :returns: List of canonical DOIs. ``None`` if an error occurred.