Add a __valid_identifiers__ list to ease fetching of identifiers in

papers

See the detailed explanations in README.md.

Also fixed some typos in docstrings.
This commit is contained in:
Lucas Verney 2016-02-01 17:32:24 +01:00
parent 5f42e7ca6c
commit c785e04589
7 changed files with 85 additions and 30 deletions

View File

@ -45,6 +45,21 @@ install the matching software (`CERMINE`, `Grobid` or `pdf-extract`). See the
docstrings of those functions for more infos on this particular point. docstrings of those functions for more infos on this particular point.
## Note on `__valid_identifiers__`
`libbmc` exposes a `__valid_identifiers__` list, containing the valid
identifier types. These are the modules exposing the same functions as the
`doi` or `isbn` modules, in particular the function extracting identifiers
from a string (`extract_from_text`) and the BibTeX fetching function
(`get_bibtex`).
If you write additional modules for other repositories, you can include them
in the `__valid_identifiers__` list, as long as they provide these functions.
This list is especially useful for the `libbmc.papers.identifiers` module,
which uses it to loop through all the available identifier types, to search
for them in the paper and retrieve the matching BibTeX entry.
## License ## License
This code is licensed under an MIT license. This code is licensed under an MIT license.

View File

@ -1,9 +1,15 @@
from . import bibtex, doi, fetcher, isbn # Global list of valid paper identifier types. See README.md.
from . import citations, papers, repositories __valid_identifiers__ = []
__version__ = "0.1" # Import order of the modules is important, as they will populate
# `__valid_identifiers__` on load, and the order in this list reflects their
# priority.
from . import bibtex, doi, fetcher, isbn # noqa
from . import citations, papers, repositories # noqa
__version__ = "0.1.1"
__all__ = [ __all__ = [
"bibtex", "doi", "fetcher", "isbn", "bibtex", "doi", "fetcher", "isbn",
"citations", "papers", "repositories" "citations", "papers", "repositories",
] ]

View File

@ -6,8 +6,12 @@ import requests
from requests.exceptions import RequestException from requests.exceptions import RequestException
from libbmc import __valid_identifiers__
from libbmc import tools from libbmc import tools
# Append DOI to the valid identifiers list
__valid_identifiers__ += ["doi"]
# Taken from # Taken from
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802 # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b", REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b",

View File

@ -5,6 +5,11 @@ import isbnlib
from libbmc import doi from libbmc import doi
from libbmc import __valid_identifiers__
# Append ISBN to the valid identifiers list
__valid_identifiers__ += ["isbn"]
def is_valid(isbn): def is_valid(isbn):
""" """

View File

@ -6,10 +6,15 @@ Needs pdftotext and/or djvutxt installed on the machine.
TODO: Unittests TODO: Unittests
""" """
import importlib
import subprocess import subprocess
import sys
from libbmc import doi, isbn from libbmc import __valid_identifiers__
from libbmc.repositories import arxiv, hal
# Import all the modules associated to __valid_identifiers__
for type in __valid_identifiers__:
importlib.import_module("libbmc.%s" % (type,))
def find_identifiers(src): def find_identifiers(src):
@ -30,7 +35,7 @@ def find_identifiers(src):
:params src: Path to the file to scan. :params src: Path to the file to scan.
:returns: a tuple (type, identifier) or ``None`` if not found or \ :returns: a tuple (type, identifier) or ``(None, None)`` if not found or \
an error occurred. an error occurred.
""" """
if src.endswith(".pdf"): if src.endswith(".pdf"):
@ -44,29 +49,43 @@ def find_identifiers(src):
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
bufsize=1) bufsize=1)
else: else:
return None return (None, None)
while totext.poll() is None: while totext.poll() is None:
extract_full = ' '.join([i.decode("utf-8").strip() extract_full = ' '.join([i.decode("utf-8").strip()
for i in totext.stdout.readlines()]) for i in totext.stdout.readlines()])
found_isbn = isbn.extract_from_text(extract_full) # Loop over all the valid identifier types
if isbn: for type in __valid_identifiers__:
totext.terminate() # Dynamically call the ``extract_from_text`` method for the
return ("isbn", found_isbn) # associated module.
m = sys.modules.get("libbmc.%s" % (type,), None)
if m is None:
continue
found_id = getattr(m, "extract_from_text")(extract_full)
if found_id:
totext.terminate()
return (type, found_id[0]) # found_id is a list of found IDs
return (None, None)
found_doi = doi.extract_from_text(extract_full)
if doi:
totext.terminate()
return ("doi", found_doi)
found_arxiv = arxiv.extract_from_text(extract_full) def get_bibtex(identifier):
if arxiv: """
totext.terminate() Try to fetch BibTeX from a found identifier.
return ("arxiv", found_arxiv)
found_hal = hal.extract_from_text(extract_full) .. note::
if hal:
totext.terminate()
return ("hal", found_hal)
return None Calls the functions in the respective identifiers module.
:param identifier: a tuple (type, identifier) with a valid type.
:returns: A BibTeX string or ``None`` if an error occurred.
# TODO: Should return a BiBTeX object?
"""
type, id = identifier
if type not in __valid_identifiers__:
return None
# Dynamically call the ``get_bibtex`` method from the associated module.
m = sys.modules.get("libbmc.%s" % (type,), None)
if m is None:
return None
return getattr(m, "get_bibtex")(id)

View File

@ -100,7 +100,7 @@ def tearpage_needed(bibtex):
def tearpage(filename, bibtex=None, force=False): def tearpage(filename, bibtex=None, force=False):
""" """
Tear the some pages of the file if needed. Tear some pages of the file if needed.
:params filename: Path to the file to handle. :params filename: Path to the file to handle.
:params bibtex: BibTeX dict associated to this file, as the one given by \ :params bibtex: BibTeX dict associated to this file, as the one given by \

View File

@ -13,9 +13,13 @@ from urllib.error import HTTPError
from requests.exceptions import RequestException from requests.exceptions import RequestException
from libbmc import __valid_identifiers__
from libbmc import tools from libbmc import tools
from libbmc.citations import bbl from libbmc.citations import bbl
# Append arXiv to the valid identifiers list
__valid_identifiers__ += ["repositories.arxiv"]
ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?" ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([ ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
@ -305,12 +309,14 @@ def extract_from_text(text):
Extract arXiv IDs from a text. Extract arXiv IDs from a text.
:param text: The text to extract arXiv IDs from. :param text: The text to extract arXiv IDs from.
:returns: A list of matching arXiv IDs. :returns: A list of matching arXiv IDs, in canonical form.
>>> sorted(extract_from_text('1506.06690 1506.06690v1 arXiv:1506.06690 arXiv:1506.06690v1 arxiv:1506.06690 arxiv:1506.06690v1 math.GT/0309136 abcdf bar1506.06690foo mare.GG/0309136')) >>> sorted(extract_from_text('1506.06690 1506.06690v1 arXiv:1506.06690 arXiv:1506.06690v1 arxiv:1506.06690 arxiv:1506.06690v1 math.GT/0309136 abcdf bar1506.06690foo mare.GG/0309136'))
['1506.06690', '1506.06690v1', 'arXiv:1506.06690', 'arXiv:1506.06690v1', 'arxiv:1506.06690', 'arxiv:1506.06690v1', 'math.GT/0309136'] ['1506.06690', '1506.06690v1', 'math.GT/0309136']
""" """
return tools.remove_duplicates([i[0] # Remove the leading "arxiv:".
return tools.remove_duplicates([re.sub("arxiv:", "", i[0],
flags=re.IGNORECASE)
for i in REGEX.findall(text) if i[0] != '']) for i in REGEX.findall(text) if i[0] != ''])
@ -335,7 +341,7 @@ def to_URL(arxiv_ids):
def to_canonical(urls): def to_canonical(urls):
""" """
Convert a list of DOIs URLs to a list of canonical DOIs. Convert a list of arXiv IDs to a list of canonical IDs.
:param dois: A list of DOIs URLs. :param dois: A list of DOIs URLs.
:returns: List of canonical DOIs. ``None`` if an error occurred. :returns: List of canonical DOIs. ``None`` if an error occurred.