From c785e04589f4c9182fb62a9ae541689a97435185 Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Mon, 1 Feb 2016 17:32:24 +0100 Subject: [PATCH] Add a __valid_identifiers__ list to ease fetching of identifiers in papers See the detailed explanations in README.md. Also fixed some typos in docstrings. --- README.md | 15 +++++++++ libbmc/__init__.py | 14 ++++++--- libbmc/doi.py | 4 +++ libbmc/isbn.py | 5 +++ libbmc/papers/identifiers.py | 61 +++++++++++++++++++++++------------- libbmc/papers/tearpages.py | 2 +- libbmc/repositories/arxiv.py | 14 ++++++--- 7 files changed, 85 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 66b6ae5..8dcb003 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,21 @@ install the matching software (`CERMINE`, `Grobid` or `pdf-extract`). See the docstrings of those functions for more infos on this particular point. +## Note on `__valid_identifiers__` + +`libbmc` exposes a `__valid_identifiers__` list, containing the valid +identifier types. These are the ones exposing the same functions as the `doi` or +`isbn` modules, in particular the extraction from a string and BibTeX +fetching functions. + +If you write additional modules for other repositories, you can include them +in the `__valid_identifiers__` list, as long as they provide these functions. + +This list is especially useful for the `libbmc.papers.identifiers` module, +which uses it to loop through all the available identifier types, to search +for them in the paper and retrieve the matching BibTeX. + + ## License This code is licensed under an MIT license. diff --git a/libbmc/__init__.py b/libbmc/__init__.py index 926387a..1946de6 100644 --- a/libbmc/__init__.py +++ b/libbmc/__init__.py @@ -1,9 +1,15 @@ -from . import bibtex, doi, fetcher, isbn -from . import citations, papers, repositories +# Global list of valid paper identifier types. See README.md. 
+__valid_identifiers__ = [] -__version__ = "0.1" +# Import order of the modules is important, as they will populate +# `__valid_identifiers__` on load, and the order in this list reflects their +# priority. +from . import bibtex, doi, fetcher, isbn # noqa +from . import citations, papers, repositories # noqa + +__version__ = "0.1.1" __all__ = [ "bibtex", "doi", "fetcher", "isbn", - "citations", "papers", "repositories" + "citations", "papers", "repositories", ] diff --git a/libbmc/doi.py b/libbmc/doi.py index bfc5da8..3fee128 100644 --- a/libbmc/doi.py +++ b/libbmc/doi.py @@ -6,8 +6,12 @@ import requests from requests.exceptions import RequestException +from libbmc import __valid_identifiers__ from libbmc import tools +# Append DOI to the valid identifiers list +__valid_identifiers__ += ["doi"] + # Taken from # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802 REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b", diff --git a/libbmc/isbn.py b/libbmc/isbn.py index 5c5904a..fe0e1c4 100644 --- a/libbmc/isbn.py +++ b/libbmc/isbn.py @@ -5,6 +5,11 @@ import isbnlib from libbmc import doi +from libbmc import __valid_identifiers__ + +# Append ISBN to the valid identifiers list +__valid_identifiers__ += ["isbn"] + def is_valid(isbn): """ diff --git a/libbmc/papers/identifiers.py b/libbmc/papers/identifiers.py index 32e340f..abca898 100644 --- a/libbmc/papers/identifiers.py +++ b/libbmc/papers/identifiers.py @@ -6,10 +6,15 @@ Needs pdftotext and/or djvutxt installed on the machine. 
TODO: Unittests """ +import importlib import subprocess +import sys -from libbmc import doi, isbn -from libbmc.repositories import arxiv, hal +from libbmc import __valid_identifiers__ + +# Import all the modules associated to __valid_identifiers__ +for type in __valid_identifiers__: + importlib.import_module("libbmc.%s" % (type,)) def find_identifiers(src): @@ -30,7 +35,7 @@ def find_identifiers(src): :params src: Path to the file to scan. - :returns: a tuple (type, identifier) or ``None`` if not found or \ + :returns: a tuple (type, identifier) or ``(None, None)`` if not found or \ an error occurred. """ if src.endswith(".pdf"): @@ -44,29 +49,43 @@ def find_identifiers(src): stderr=subprocess.PIPE, bufsize=1) else: - return None + return (None, None) while totext.poll() is None: extract_full = ' '.join([i.decode("utf-8").strip() for i in totext.stdout.readlines()]) - found_isbn = isbn.extract_from_text(extract_full) - if isbn: - totext.terminate() - return ("isbn", found_isbn) + # Loop over all the valid identifier types + for type in __valid_identifiers__: + # Dynamically call the ``extract_from_text`` method for the + # associated module. + m = sys.modules.get("libbmc.%s" % (type,), None) + if m is None: + continue + found_id = getattr(m, "extract_from_text")(extract_full) + if found_id: + totext.terminate() + return (type, found_id[0]) # found_id is a list of found IDs + return (None, None) - found_doi = doi.extract_from_text(extract_full) - if doi: - totext.terminate() - return ("doi", found_doi) - found_arxiv = arxiv.extract_from_text(extract_full) - if arxiv: - totext.terminate() - return ("arxiv", found_arxiv) +def get_bibtex(identifier): + """ + Try to fetch BibTeX from a found identifier. - found_hal = hal.extract_from_text(extract_full) - if hal: - totext.terminate() - return ("hal", found_hal) + .. note:: - return None + Calls the functions in the respective identifiers module. + + :param identifier: a tuple (type, identifier) with a valid type. 
+ :returns: A BibTeX string or ``None`` if an error occurred. + # TODO: Should return a BibTeX object? + """ + type, id = identifier + if type not in __valid_identifiers__: + return None + + # Dynamically call the ``get_bibtex`` method from the associated module. + m = sys.modules.get("libbmc.%s" % (type,), None) + if m is None: + return None + return getattr(m, "get_bibtex")(id) diff --git a/libbmc/papers/tearpages.py b/libbmc/papers/tearpages.py index 24a2f9f..a58af58 100644 --- a/libbmc/papers/tearpages.py +++ b/libbmc/papers/tearpages.py @@ -100,7 +100,7 @@ def tearpage_needed(bibtex): def tearpage(filename, bibtex=None, force=False): """ - Tear the some pages of the file if needed. + Tear some pages of the file if needed. :params filename: Path to the file to handle. :params bibtex: BibTeX dict associated to this file, as the one given by \ diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py index bea7f1f..335f480 100644 --- a/libbmc/repositories/arxiv.py +++ b/libbmc/repositories/arxiv.py @@ -13,9 +13,13 @@ from urllib.error import HTTPError from requests.exceptions import RequestException +from libbmc import __valid_identifiers__ from libbmc import tools from libbmc.citations import bbl +# Append arXiv to the valid identifiers list +__valid_identifiers__ += ["repositories.arxiv"] + ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?" ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([ @@ -305,12 +309,14 @@ def extract_from_text(text): Extract arXiv IDs from a text. :param text: The text to extract arXiv IDs from. - :returns: A list of matching arXiv IDs. + :returns: A list of matching arXiv IDs, in canonical form. 
>>> sorted(extract_from_text('1506.06690 1506.06690v1 arXiv:1506.06690 arXiv:1506.06690v1 arxiv:1506.06690 arxiv:1506.06690v1 math.GT/0309136 abcdf bar1506.06690foo mare.GG/0309136')) - ['1506.06690', '1506.06690v1', 'arXiv:1506.06690', 'arXiv:1506.06690v1', 'arxiv:1506.06690', 'arxiv:1506.06690v1', 'math.GT/0309136'] + ['1506.06690', '1506.06690v1', 'math.GT/0309136'] """ - return tools.remove_duplicates([i[0] + # Remove the leading "arxiv:". + return tools.remove_duplicates([re.sub("arxiv:", "", i[0], + flags=re.IGNORECASE) for i in REGEX.findall(text) if i[0] != '']) @@ -335,7 +341,7 @@ def to_URL(arxiv_ids): def to_canonical(urls): """ - Convert a list of DOIs URLs to a list of canonical DOIs. + Convert a list of arXiv IDs to a list of canonical IDs. :param dois: A list of DOIs URLs. :returns: List of canonical DOIs. ``None`` if an error occurred.