Add a __valid_identifiers__ list to ease fetching of identifiers in
papers. See the detailed explanation in README.md. Also fixes some typos in docstrings.
This commit is contained in:
parent
5f42e7ca6c
commit
c785e04589
15
README.md
15
README.md
@ -45,6 +45,21 @@ install the matching software (`CERMINE`, `Grobid` or `pdf-extract`). See the
|
||||
docstrings of those functions for more information on this particular point.
|
||||
|
||||
|
||||
## Note on `__valid_identifiers__`
|
||||
|
||||
`libbmc` exposes a `__valid_identifiers__` list, containing the valid
|
||||
identifier types. These are the modules exposing the same functions as the `doi` or
|
||||
`isbn` modules, in particular the extraction from a string and BibTeX
|
||||
fetching functions.
|
||||
|
||||
If you write additional modules for other repositories, you can include them
|
||||
in the `__valid_identifiers__` list, as long as they provide these functions.
|
||||
|
||||
This list is especially useful for the `libbmc.papers.identifiers` module,
|
||||
which uses it to loop through all the available identifier types, to search
|
||||
for them in the paper and retrieve the matching BibTeX entry.
|
||||
|
||||
|
||||
## License
|
||||
|
||||
This code is licensed under an MIT license.
|
||||
|
@ -1,9 +1,15 @@
|
||||
from . import bibtex, doi, fetcher, isbn
|
||||
from . import citations, papers, repositories
|
||||
# Global list of valid paper identifier types. See README.md.
|
||||
__valid_identifiers__ = []
|
||||
|
||||
__version__ = "0.1"
|
||||
# Import order of the modules is important, as they will populate
|
||||
# `__valid_identifiers__` on load, and the order in this list reflects their
|
||||
# priority.
|
||||
from . import bibtex, doi, fetcher, isbn # noqa
|
||||
from . import citations, papers, repositories # noqa
|
||||
|
||||
__version__ = "0.1.1"
|
||||
|
||||
__all__ = [
|
||||
"bibtex", "doi", "fetcher", "isbn",
|
||||
"citations", "papers", "repositories"
|
||||
"citations", "papers", "repositories",
|
||||
]
|
||||
|
@ -6,8 +6,12 @@ import requests
|
||||
|
||||
from requests.exceptions import RequestException
|
||||
|
||||
from libbmc import __valid_identifiers__
|
||||
from libbmc import tools
|
||||
|
||||
# Append DOI to the valid identifiers list
|
||||
__valid_identifiers__ += ["doi"]
|
||||
|
||||
# Taken from
|
||||
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
|
||||
REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b",
|
||||
|
@ -5,6 +5,11 @@ import isbnlib
|
||||
|
||||
from libbmc import doi
|
||||
|
||||
from libbmc import __valid_identifiers__
|
||||
|
||||
# Append ISBN to the valid identifiers list
|
||||
__valid_identifiers__ += ["isbn"]
|
||||
|
||||
|
||||
def is_valid(isbn):
|
||||
"""
|
||||
|
@ -6,10 +6,15 @@ Needs pdftotext and/or djvutxt installed on the machine.
|
||||
|
||||
TODO: Unittests
|
||||
"""
|
||||
import importlib
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from libbmc import doi, isbn
|
||||
from libbmc.repositories import arxiv, hal
|
||||
from libbmc import __valid_identifiers__
|
||||
|
||||
# Import all the modules associated to __valid_identifiers__
|
||||
for type in __valid_identifiers__:
|
||||
importlib.import_module("libbmc.%s" % (type,))
|
||||
|
||||
|
||||
def find_identifiers(src):
|
||||
@ -30,7 +35,7 @@ def find_identifiers(src):
|
||||
|
||||
:params src: Path to the file to scan.
|
||||
|
||||
:returns: a tuple (type, identifier) or ``None`` if not found or \
|
||||
:returns: a tuple (type, identifier) or ``(None, None)`` if not found or \
|
||||
an error occurred.
|
||||
"""
|
||||
if src.endswith(".pdf"):
|
||||
@ -44,29 +49,43 @@ def find_identifiers(src):
|
||||
stderr=subprocess.PIPE,
|
||||
bufsize=1)
|
||||
else:
|
||||
return None
|
||||
return (None, None)
|
||||
|
||||
while totext.poll() is None:
|
||||
extract_full = ' '.join([i.decode("utf-8").strip()
|
||||
for i in totext.stdout.readlines()])
|
||||
found_isbn = isbn.extract_from_text(extract_full)
|
||||
if isbn:
|
||||
totext.terminate()
|
||||
return ("isbn", found_isbn)
|
||||
# Loop over all the valid identifier types
|
||||
for type in __valid_identifiers__:
|
||||
# Dynamically call the ``extract_from_text`` method for the
|
||||
# associated module.
|
||||
m = sys.modules.get("libbmc.%s" % (type,), None)
|
||||
if m is None:
|
||||
continue
|
||||
found_id = getattr(m, "extract_from_text")(extract_full)
|
||||
if found_id:
|
||||
totext.terminate()
|
||||
return (type, found_id[0]) # found_id is a list of found IDs
|
||||
return (None, None)
|
||||
|
||||
found_doi = doi.extract_from_text(extract_full)
|
||||
if doi:
|
||||
totext.terminate()
|
||||
return ("doi", found_doi)
|
||||
|
||||
found_arxiv = arxiv.extract_from_text(extract_full)
|
||||
if arxiv:
|
||||
totext.terminate()
|
||||
return ("arxiv", found_arxiv)
|
||||
def get_bibtex(identifier):
|
||||
"""
|
||||
Try to fetch BibTeX from a found identifier.
|
||||
|
||||
found_hal = hal.extract_from_text(extract_full)
|
||||
if hal:
|
||||
totext.terminate()
|
||||
return ("hal", found_hal)
|
||||
.. note::
|
||||
|
||||
return None
|
||||
Calls the functions in the respective identifiers module.
|
||||
|
||||
:param identifier: a tuple (type, identifier) with a valid type.
|
||||
:returns: A BibTeX string or ``None`` if an error occurred.
|
||||
# TODO: Should return a BibTeX object?
|
||||
"""
|
||||
type, id = identifier
|
||||
if type not in __valid_identifiers__:
|
||||
return None
|
||||
|
||||
# Dynamically call the ``get_bibtex`` method from the associated module.
|
||||
m = sys.modules.get("libbmc.%s" % (type,), None)
|
||||
if m is None:
|
||||
return None
|
||||
return getattr(m, "get_bibtex")(id)
|
||||
|
@ -100,7 +100,7 @@ def tearpage_needed(bibtex):
|
||||
|
||||
def tearpage(filename, bibtex=None, force=False):
|
||||
"""
|
||||
Tear the some pages of the file if needed.
|
||||
Tear some pages of the file if needed.
|
||||
|
||||
:params filename: Path to the file to handle.
|
||||
:params bibtex: BibTeX dict associated to this file, as the one given by \
|
||||
|
@ -13,9 +13,13 @@ from urllib.error import HTTPError
|
||||
from requests.exceptions import RequestException
|
||||
|
||||
|
||||
from libbmc import __valid_identifiers__
|
||||
from libbmc import tools
|
||||
from libbmc.citations import bbl
|
||||
|
||||
# Append arXiv to the valid identifiers list
|
||||
__valid_identifiers__ += ["repositories.arxiv"]
|
||||
|
||||
|
||||
ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
|
||||
ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
|
||||
@ -305,12 +309,14 @@ def extract_from_text(text):
|
||||
Extract arXiv IDs from a text.
|
||||
|
||||
:param text: The text to extract arXiv IDs from.
|
||||
:returns: A list of matching arXiv IDs.
|
||||
:returns: A list of matching arXiv IDs, in canonical form.
|
||||
|
||||
>>> sorted(extract_from_text('1506.06690 1506.06690v1 arXiv:1506.06690 arXiv:1506.06690v1 arxiv:1506.06690 arxiv:1506.06690v1 math.GT/0309136 abcdf bar1506.06690foo mare.GG/0309136'))
|
||||
['1506.06690', '1506.06690v1', 'arXiv:1506.06690', 'arXiv:1506.06690v1', 'arxiv:1506.06690', 'arxiv:1506.06690v1', 'math.GT/0309136']
|
||||
['1506.06690', '1506.06690v1', 'math.GT/0309136']
|
||||
"""
|
||||
return tools.remove_duplicates([i[0]
|
||||
# Remove the leading "arxiv:".
|
||||
return tools.remove_duplicates([re.sub("arxiv:", "", i[0],
|
||||
flags=re.IGNORECASE)
|
||||
for i in REGEX.findall(text) if i[0] != ''])
|
||||
|
||||
|
||||
@ -335,7 +341,7 @@ def to_URL(arxiv_ids):
|
||||
|
||||
def to_canonical(urls):
|
||||
"""
|
||||
Convert a list of DOIs URLs to a list of canonical DOIs.
|
||||
Convert a list of arXiv IDs to a list of canonical IDs.
|
||||
|
||||
:param dois: A list of DOIs URLs.
|
||||
:returns: List of canonical DOIs. ``None`` if an error occurred.
|
||||
|
Loading…
Reference in New Issue
Block a user