Add a __valid_identifiers__ list to ease fetching of identifiers in
papers. See the detailed explanations in README.md. Also fixed some typos in docstrings.
This commit is contained in:
parent
5f42e7ca6c
commit
c785e04589
15
README.md
15
README.md
@ -45,6 +45,21 @@ install the matching software (`CERMINE`, `Grobid` or `pdf-extract`). See the
|
|||||||
docstrings of those functions for more infos on this particular point.
|
docstrings of those functions for more infos on this particular point.
|
||||||
|
|
||||||
|
|
||||||
|
## Note on `__valid_identifiers__`
|
||||||
|
|
||||||
|
`libbmc` exposes a `__valid_identifiers__` list, containing the valid
|
||||||
|
identifier types. These are the modules exposing the same functions as the `doi` or
|
||||||
|
`isbn` modules, in particular the extraction from a string and BibTeX
|
||||||
|
fetching functions.
|
||||||
|
|
||||||
|
If you write additional modules for other repositories, you can include them
|
||||||
|
in the `__valid_identifiers__` list, as long as they provide these functions.
|
||||||
|
|
||||||
|
This list is especially useful for the `libbmc.papers.identifiers` module,
|
||||||
|
which uses it to loop through all the available identifier types, to search
|
||||||
|
for them in the paper and retrieve BibTeX from them.
|
||||||
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
This code is licensed under an MIT license.
|
This code is licensed under an MIT license.
|
||||||
|
@ -1,9 +1,15 @@
|
|||||||
from . import bibtex, doi, fetcher, isbn
|
# Global list of valid paper identifier types. See README.md.
|
||||||
from . import citations, papers, repositories
|
__valid_identifiers__ = []
|
||||||
|
|
||||||
__version__ = "0.1"
|
# Import order of the modules is important, as they will populate
|
||||||
|
# `__valid_identifiers__` on load, and the order in this list reflects their
|
||||||
|
# priority.
|
||||||
|
from . import bibtex, doi, fetcher, isbn # noqa
|
||||||
|
from . import citations, papers, repositories # noqa
|
||||||
|
|
||||||
|
__version__ = "0.1.1"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"bibtex", "doi", "fetcher", "isbn",
|
"bibtex", "doi", "fetcher", "isbn",
|
||||||
"citations", "papers", "repositories"
|
"citations", "papers", "repositories",
|
||||||
]
|
]
|
||||||
|
@ -6,8 +6,12 @@ import requests
|
|||||||
|
|
||||||
from requests.exceptions import RequestException
|
from requests.exceptions import RequestException
|
||||||
|
|
||||||
|
from libbmc import __valid_identifiers__
|
||||||
from libbmc import tools
|
from libbmc import tools
|
||||||
|
|
||||||
|
# Append DOI to the valid identifiers list
|
||||||
|
__valid_identifiers__ += ["doi"]
|
||||||
|
|
||||||
# Taken from
|
# Taken from
|
||||||
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
|
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
|
||||||
REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b",
|
REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b",
|
||||||
|
@ -5,6 +5,11 @@ import isbnlib
|
|||||||
|
|
||||||
from libbmc import doi
|
from libbmc import doi
|
||||||
|
|
||||||
|
from libbmc import __valid_identifiers__
|
||||||
|
|
||||||
|
# Append ISBN to the valid identifiers list
|
||||||
|
__valid_identifiers__ += ["isbn"]
|
||||||
|
|
||||||
|
|
||||||
def is_valid(isbn):
|
def is_valid(isbn):
|
||||||
"""
|
"""
|
||||||
|
@ -6,10 +6,15 @@ Needs pdftotext and/or djvutxt installed on the machine.
|
|||||||
|
|
||||||
TODO: Unittests
|
TODO: Unittests
|
||||||
"""
|
"""
|
||||||
|
import importlib
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
from libbmc import doi, isbn
|
from libbmc import __valid_identifiers__
|
||||||
from libbmc.repositories import arxiv, hal
|
|
||||||
|
# Import all the modules associated to __valid_identifiers__
|
||||||
|
for type in __valid_identifiers__:
|
||||||
|
importlib.import_module("libbmc.%s" % (type,))
|
||||||
|
|
||||||
|
|
||||||
def find_identifiers(src):
|
def find_identifiers(src):
|
||||||
@ -30,7 +35,7 @@ def find_identifiers(src):
|
|||||||
|
|
||||||
:params src: Path to the file to scan.
|
:params src: Path to the file to scan.
|
||||||
|
|
||||||
:returns: a tuple (type, identifier) or ``None`` if not found or \
|
:returns: a tuple (type, identifier) or ``(None, None)`` if not found or \
|
||||||
an error occurred.
|
an error occurred.
|
||||||
"""
|
"""
|
||||||
if src.endswith(".pdf"):
|
if src.endswith(".pdf"):
|
||||||
@ -44,29 +49,43 @@ def find_identifiers(src):
|
|||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.PIPE,
|
||||||
bufsize=1)
|
bufsize=1)
|
||||||
else:
|
else:
|
||||||
return None
|
return (None, None)
|
||||||
|
|
||||||
while totext.poll() is None:
|
while totext.poll() is None:
|
||||||
extract_full = ' '.join([i.decode("utf-8").strip()
|
extract_full = ' '.join([i.decode("utf-8").strip()
|
||||||
for i in totext.stdout.readlines()])
|
for i in totext.stdout.readlines()])
|
||||||
found_isbn = isbn.extract_from_text(extract_full)
|
# Loop over all the valid identifier types
|
||||||
if isbn:
|
for type in __valid_identifiers__:
|
||||||
|
# Dynamically call the ``extract_from_text`` method for the
|
||||||
|
# associated module.
|
||||||
|
m = sys.modules.get("libbmc.%s" % (type,), None)
|
||||||
|
if m is None:
|
||||||
|
continue
|
||||||
|
found_id = getattr(m, "extract_from_text")(extract_full)
|
||||||
|
if found_id:
|
||||||
totext.terminate()
|
totext.terminate()
|
||||||
return ("isbn", found_isbn)
|
return (type, found_id[0]) # found_id is a list of found IDs
|
||||||
|
return (None, None)
|
||||||
|
|
||||||
found_doi = doi.extract_from_text(extract_full)
|
|
||||||
if doi:
|
|
||||||
totext.terminate()
|
|
||||||
return ("doi", found_doi)
|
|
||||||
|
|
||||||
found_arxiv = arxiv.extract_from_text(extract_full)
|
def get_bibtex(identifier):
|
||||||
if arxiv:
|
"""
|
||||||
totext.terminate()
|
Try to fetch BibTeX from a found identifier.
|
||||||
return ("arxiv", found_arxiv)
|
|
||||||
|
|
||||||
found_hal = hal.extract_from_text(extract_full)
|
.. note::
|
||||||
if hal:
|
|
||||||
totext.terminate()
|
|
||||||
return ("hal", found_hal)
|
|
||||||
|
|
||||||
|
Calls the functions in the respective identifiers module.
|
||||||
|
|
||||||
|
:param identifier: a tuple (type, identifier) with a valid type.
|
||||||
|
:returns: A BibTeX string or ``None`` if an error occurred.
|
||||||
|
# TODO: Should return a BiBTeX object?
|
||||||
|
"""
|
||||||
|
type, id = identifier
|
||||||
|
if type not in __valid_identifiers__:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Dynamically call the ``get_bibtex`` method from the associated module.
|
||||||
|
m = sys.modules.get("libbmc.%s" % (type,), None)
|
||||||
|
if m is None:
|
||||||
|
return None
|
||||||
|
return getattr(m, "get_bibtex")(id)
|
||||||
|
@ -100,7 +100,7 @@ def tearpage_needed(bibtex):
|
|||||||
|
|
||||||
def tearpage(filename, bibtex=None, force=False):
|
def tearpage(filename, bibtex=None, force=False):
|
||||||
"""
|
"""
|
||||||
Tear the some pages of the file if needed.
|
Tear some pages of the file if needed.
|
||||||
|
|
||||||
:params filename: Path to the file to handle.
|
:params filename: Path to the file to handle.
|
||||||
:params bibtex: BibTeX dict associated to this file, as the one given by \
|
:params bibtex: BibTeX dict associated to this file, as the one given by \
|
||||||
|
@ -13,9 +13,13 @@ from urllib.error import HTTPError
|
|||||||
from requests.exceptions import RequestException
|
from requests.exceptions import RequestException
|
||||||
|
|
||||||
|
|
||||||
|
from libbmc import __valid_identifiers__
|
||||||
from libbmc import tools
|
from libbmc import tools
|
||||||
from libbmc.citations import bbl
|
from libbmc.citations import bbl
|
||||||
|
|
||||||
|
# Append arXiv to the valid identifiers list
|
||||||
|
__valid_identifiers__ += ["repositories.arxiv"]
|
||||||
|
|
||||||
|
|
||||||
ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
|
ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
|
||||||
ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
|
ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
|
||||||
@ -305,12 +309,14 @@ def extract_from_text(text):
|
|||||||
Extract arXiv IDs from a text.
|
Extract arXiv IDs from a text.
|
||||||
|
|
||||||
:param text: The text to extract arXiv IDs from.
|
:param text: The text to extract arXiv IDs from.
|
||||||
:returns: A list of matching arXiv IDs.
|
:returns: A list of matching arXiv IDs, in canonical form.
|
||||||
|
|
||||||
>>> sorted(extract_from_text('1506.06690 1506.06690v1 arXiv:1506.06690 arXiv:1506.06690v1 arxiv:1506.06690 arxiv:1506.06690v1 math.GT/0309136 abcdf bar1506.06690foo mare.GG/0309136'))
|
>>> sorted(extract_from_text('1506.06690 1506.06690v1 arXiv:1506.06690 arXiv:1506.06690v1 arxiv:1506.06690 arxiv:1506.06690v1 math.GT/0309136 abcdf bar1506.06690foo mare.GG/0309136'))
|
||||||
['1506.06690', '1506.06690v1', 'arXiv:1506.06690', 'arXiv:1506.06690v1', 'arxiv:1506.06690', 'arxiv:1506.06690v1', 'math.GT/0309136']
|
['1506.06690', '1506.06690v1', 'math.GT/0309136']
|
||||||
"""
|
"""
|
||||||
return tools.remove_duplicates([i[0]
|
# Remove the leading "arxiv:".
|
||||||
|
return tools.remove_duplicates([re.sub("arxiv:", "", i[0],
|
||||||
|
flags=re.IGNORECASE)
|
||||||
for i in REGEX.findall(text) if i[0] != ''])
|
for i in REGEX.findall(text) if i[0] != ''])
|
||||||
|
|
||||||
|
|
||||||
@ -335,7 +341,7 @@ def to_URL(arxiv_ids):
|
|||||||
|
|
||||||
def to_canonical(urls):
|
def to_canonical(urls):
|
||||||
"""
|
"""
|
||||||
Convert a list of DOIs URLs to a list of canonical DOIs.
|
Convert a list of arXiv IDs to a list of canonical IDs.
|
||||||
|
|
||||||
:param dois: A list of DOIs URLs.
|
:param dois: A list of DOIs URLs.
|
||||||
:returns: List of canonical DOIs. ``None`` if an error occurred.
|
:returns: List of canonical DOIs. ``None`` if an error occurred.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user