Add a __valid_identifiers__ list to ease fetching of identifiers in papers

See the detailed explanations in README.md. Also fixed some typos in docstrings.
2016-02-01 17:32:24 +01:00 · 2016-02-01 17:32:24 +01:00 · c785e04589
commit c785e04589
parent 5f42e7ca6c
7 changed files with 85 additions and 30 deletions
--- a/README.md
+++ b/README.md
@ -45,6 +45,21 @@ install the matching software (`CERMINE`, `Grobid` or `pdf-extract`). See the
 docstrings of those functions for more infos on this particular point.


+## Note on `__valid_identifiers__`
+
+`libbmc` exposes a `__valid_identifiers__` list, containing the valid
+identifier types. These are the ones whose modules expose the same functions
+as the `doi` or `isbn` modules, in particular the extraction from a string and
+BibTeX fetching functions.
+
+If you write additional modules for other repositories, you can include them
+in the `__valid_identifiers__` list, as long as they provide these functions.
+
+This list is especially useful for the `libbmc.papers.identifiers` module,
+which uses it to loop through all the available identifier types, to search
+for them in the paper and retrieve the matching BibTeX.
+
+
 ## License

 This code is licensed under an MIT license.
--- a/libbmc/__init__.py
+++ b/libbmc/__init__.py
@ -1,9 +1,15 @@
-from . import bibtex, doi, fetcher, isbn
-from . import citations, papers, repositories
+# Global list of valid paper identifier types. See README.md.
+__valid_identifiers__ = []

-__version__ = "0.1"
+# Import order of the modules is important, as they will populate
+# `__valid_identifiers__` on load, and the order in this list reflects their
+# priority.
+from . import bibtex, doi, fetcher, isbn  # noqa
+from . import citations, papers, repositories  # noqa
+
+__version__ = "0.1.1"

 __all__ = [
    "bibtex", "doi", "fetcher", "isbn",
-    "citations", "papers", "repositories"
+    "citations", "papers", "repositories",
 ]
--- a/libbmc/doi.py
+++ b/libbmc/doi.py
@ -6,8 +6,12 @@ import requests

 from requests.exceptions import RequestException

+from libbmc import __valid_identifiers__
 from libbmc import tools

+# Append DOI to the valid identifiers list
+__valid_identifiers__ += ["doi"]
+
 # Taken from
 # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
 REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b",
--- a/libbmc/isbn.py
+++ b/libbmc/isbn.py
@ -5,6 +5,11 @@ import isbnlib

 from libbmc import doi

+from libbmc import __valid_identifiers__
+
+# Append ISBN to the valid identifiers list
+__valid_identifiers__ += ["isbn"]
+

 def is_valid(isbn):
    """
--- a/libbmc/papers/identifiers.py
+++ b/libbmc/papers/identifiers.py
@ -6,10 +6,15 @@ Needs pdftotext and/or djvutxt installed on the machine.

 TODO: Unittests
 """
+import importlib
 import subprocess
+import sys

-from libbmc import doi, isbn
-from libbmc.repositories import arxiv, hal
+from libbmc import __valid_identifiers__
+
+# Import all the modules associated to __valid_identifiers__
+for type in __valid_identifiers__:
+    importlib.import_module("libbmc.%s" % (type,))


 def find_identifiers(src):
@ -30,7 +35,7 @@ def find_identifiers(src):

    :params src: Path to the file to scan.

-    :returns: a tuple (type, identifier) or ``None`` if not found or \
+    :returns: a tuple (type, identifier) or ``(None, None)`` if not found or \
            an error occurred.
    """
    if src.endswith(".pdf"):
@ -44,29 +49,43 @@ def find_identifiers(src):
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    else:
-        return None
+        return (None, None)

    while totext.poll() is None:
        extract_full = ' '.join([i.decode("utf-8").strip()
                                for i in totext.stdout.readlines()])
-        found_isbn = isbn.extract_from_text(extract_full)
-        if isbn:
-            totext.terminate()
-            return ("isbn", found_isbn)
+        # Loop over all the valid identifier types
+        for type in __valid_identifiers__:
+            # Dynamically call the ``extract_from_text`` method for the
+            # associated module.
+            m = sys.modules.get("libbmc.%s" % (type,), None)
+            if m is None:
+                continue
+            found_id = getattr(m, "extract_from_text")(extract_full)
+            if found_id:
+                totext.terminate()
+                return (type, found_id[0])  # found_id is a list of found IDs
+    return (None, None)

-        found_doi = doi.extract_from_text(extract_full)
-        if doi:
-            totext.terminate()
-            return ("doi", found_doi)

-        found_arxiv = arxiv.extract_from_text(extract_full)
-        if arxiv:
-            totext.terminate()
-            return ("arxiv", found_arxiv)
+def get_bibtex(identifier):
+    """
+    Try to fetch BibTeX from a found identifier.

-        found_hal = hal.extract_from_text(extract_full)
-        if hal:
-            totext.terminate()
-            return ("hal", found_hal)
+    .. note::

-    return None
+        Calls the functions in the respective identifiers module.
+
+    :param identifier: a tuple (type, identifier) with a valid type.
+    :returns: A BibTeX string or ``None`` if an error occurred.
+    # TODO: Should return a BiBTeX object?
+    """
+    type, id = identifier
+    if type not in __valid_identifiers__:
+        return None
+
+    # Dynamically call the ``get_bibtex`` method from the associated module.
+    m = sys.modules.get("libbmc.%s" % (type,), None)
+    if m is None:
+        return None
+    return getattr(m, "get_bibtex")(id)
--- a/libbmc/papers/tearpages.py
+++ b/libbmc/papers/tearpages.py
@ -100,7 +100,7 @@ def tearpage_needed(bibtex):

 def tearpage(filename, bibtex=None, force=False):
    """
-    Tear the some pages of the file if needed.
+    Tear some pages of the file if needed.

    :params filename: Path to the file to handle.
    :params bibtex: BibTeX dict associated to this file, as the one given by \
--- a/libbmc/repositories/arxiv.py
+++ b/libbmc/repositories/arxiv.py
@ -13,9 +13,13 @@ from urllib.error import HTTPError
 from requests.exceptions import RequestException


+from libbmc import __valid_identifiers__
 from libbmc import tools
 from libbmc.citations import bbl

+# Append arXiv to the valid identifiers list
+__valid_identifiers__ += ["repositories.arxiv"]
+

 ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
 ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
@ -305,12 +309,14 @@ def extract_from_text(text):
    Extract arXiv IDs from a text.

    :param text: The text to extract arXiv IDs from.
-    :returns: A list of matching arXiv IDs.
+    :returns: A list of matching arXiv IDs, in canonical form.

    >>> sorted(extract_from_text('1506.06690 1506.06690v1 arXiv:1506.06690 arXiv:1506.06690v1 arxiv:1506.06690 arxiv:1506.06690v1 math.GT/0309136 abcdf bar1506.06690foo mare.GG/0309136'))
-    ['1506.06690', '1506.06690v1', 'arXiv:1506.06690', 'arXiv:1506.06690v1', 'arxiv:1506.06690', 'arxiv:1506.06690v1', 'math.GT/0309136']
+    ['1506.06690', '1506.06690v1', 'math.GT/0309136']
    """
-    return tools.remove_duplicates([i[0]
+    # Remove the leading "arxiv:".
+    return tools.remove_duplicates([re.sub("arxiv:", "", i[0],
+                                           flags=re.IGNORECASE)
                                    for i in REGEX.findall(text) if i[0] != ''])


@ -335,7 +341,7 @@ def to_URL(arxiv_ids):

 def to_canonical(urls):
    """
-    Convert a list of DOIs URLs to a list of canonical DOIs.
+    Convert a list of arXiv IDs to a list of canonical IDs.

    :param dois: A list of DOIs URLs.
    :returns: List of canonical DOIs. ``None`` if an error occurred.