2016-01-10 15:12:06 +01:00
|
|
|
"""
|
|
|
|
This file contains various functions to fetch unique identifiers from papers
|
|
|
|
(DOIs, arXiv id etc).
|
|
|
|
|
|
|
|
Needs pdftotext and/or djvutxt installed on the machine.
|
2016-01-20 23:40:07 +01:00
|
|
|
|
|
|
|
TODO: Unittests
|
2016-01-10 15:12:06 +01:00
|
|
|
"""
|
2016-02-01 17:32:24 +01:00
|
|
|
import importlib
|
2016-01-10 15:12:06 +01:00
|
|
|
import subprocess
|
2016-02-01 17:32:24 +01:00
|
|
|
import sys
|
2016-01-10 15:12:06 +01:00
|
|
|
|
2016-02-01 17:32:24 +01:00
|
|
|
from libbmc import __valid_identifiers__
|
|
|
|
|
|
|
|
# Import all the modules associated to __valid_identifiers__ up front, so
# that each ``libbmc.<identifier>`` module is registered in ``sys.modules``.
# The functions in this file look the modules up there by name instead of
# importing them on demand.
for valid_identifier in __valid_identifiers__:
    importlib.import_module("libbmc.%s" % (valid_identifier,))
|
2016-01-10 15:12:06 +01:00
|
|
|
|
|
|
|
|
|
|
|
def find_identifiers(src):
    """
    Search for a valid identifier (DOI, ISBN, arXiv, HAL) in a given file.

    The file is converted to plain text with an external tool (``pdftotext``
    for ``.pdf`` files, ``djvutxt`` for ``.djvu`` files). The text is then
    passed to the ``extract_from_text`` function of each module listed in
    ``__valid_identifiers__``, in iteration order.

    .. note::

        This function returns the first matching identifier, that is the most
        likely to be relevant for this file. However, it may fail and return
        an identifier taken from the references or another paper.

    .. note::

        You will need to have ``pdftotext`` and/or ``djvutxt`` installed
        system-wide before processing files with this function.

    :param src: Path to the file to scan.

    :returns: a tuple ``(type, identifier)`` or ``(None, None)`` if not found
        (unsupported extension, converter unavailable, empty text or no
        match) or an error occurred.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        # Unsupported file type, nothing to convert.
        return (None, None)

    try:
        totext = subprocess.Popen(command,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        # communicate() drains both pipes and waits for the child to exit.
        # Polling the process before reading would lose the output of a
        # converter that finishes quickly, and an unread stderr pipe could
        # block the child forever.
        raw_text, _ = totext.communicate()
    except OSError:
        # The converter is not installed or cannot be executed: report
        # "not found" as documented instead of propagating the error.
        return (None, None)

    # Decode once at the I/O boundary; undecodable bytes are replaced so a
    # stray byte in the converter output cannot abort the search.
    decoded_text = raw_text.decode("utf-8", "replace")
    extract_full = ' '.join(line.strip()
                            for line in decoded_text.splitlines())
    if not extract_full.strip():
        # No text could be extracted, so there is nothing to search in.
        return (None, None)

    # Loop over all the valid identifier types
    for identifier in __valid_identifiers__:
        # Dynamically call the ``extract_from_text`` function of the
        # associated module, skipping modules that are not loaded.
        module = sys.modules.get("libbmc.%s" % (identifier,), None)
        if module is None:
            continue
        found_id = module.extract_from_text(extract_full)
        if found_id:
            # found_id is a list of found IDs, keep the first one only.
            return (identifier, found_id[0])
    return (None, None)
|
|
|
|
|
|
|
|
|
|
|
|
def get_bibtex(identifier):
    """
    Fetch the BibTeX entry matching an identifier.

    .. note::

        The actual work is delegated to the ``get_bibtex`` function of the
        ``libbmc.<type>`` module matching the identifier type.

    :param identifier: a tuple ``(type, identifier)`` with a valid type.
    :returns: A BibTeX string or ``None`` if an error occurred (unknown \
            type or module not loaded).
    """
    # TODO: Should return a BiBTeX object?
    kind, value = identifier
    if kind in __valid_identifiers__:
        # The backend module is resolved by name among the loaded modules.
        backend = sys.modules.get("libbmc.%s" % (kind,), None)
        if backend is not None:
            return backend.get_bibtex(value)
    return None
|