Add identifiers fetching from papers

This commit is contained in:
Lucas Verney 2016-01-10 15:12:06 +01:00
parent f688534e33
commit ba564be738
3 changed files with 98 additions and 2 deletions
libbmc

@ -0,0 +1,64 @@
"""
This file contains various functions to fetch unique identifiers from papers
(DOIs, arXiv id etc).
Needs pdftotext and/or djvutxt installed on the machine.
"""
import subprocess
from libbmc import doi, isbn
from libbmc.repositories import arxiv, hal
def find_identifiers(src):
    """
    Search for a valid identifier (DOI, ISBN, arXiv, HAL) in a given file.

    The file is first converted to plain text with ``pdftotext`` (for
    ``.pdf`` files) or ``djvutxt`` (for ``.djvu`` files). The extractors are
    then tried in this order: ISBN, DOI, arXiv, HAL. The first one yielding a
    non-empty result wins.

    .. note ::

        This function returns the first matching identifier, that is the most
        likely to be relevant for this file. However, it may fail and return
        an identifier taken from the references or another paper.

    :param src: Path to the file to scan.
    :returns: a tuple ``(type, found)`` where ``type`` is one of ``"isbn"``, \
            ``"doi"``, ``"arxiv"`` or ``"hal"`` and ``found`` is the \
            (non-empty) value returned by the matching \
            ``extract_from_text`` function, or ``None`` if nothing was \
            found, the file extension is not supported or the converter \
            could not be started.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return None

    try:
        totext = subprocess.Popen(command,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    except OSError:
        # The converter is not installed or cannot be executed: honour the
        # documented contract and report "nothing found" instead of raising.
        return None

    # communicate() drains stdout *and* stderr until EOF and reaps the
    # process. Polling the process instead would skip the extraction when the
    # converter exits quickly, and could deadlock on an unread stderr pipe.
    raw_text, _ = totext.communicate()

    # Undecodable bytes must not abort the search, so they are replaced.
    text = raw_text.decode("utf-8", errors="replace")
    extract_full = ' '.join(line.strip() for line in text.split("\n"))

    # Order matters: the first extractor returning something is trusted.
    extractors = (
        ("isbn", isbn),
        ("doi", doi),
        ("arxiv", arxiv),
        ("hal", hal),
    )
    for id_type, module in extractors:
        found = module.extract_from_text(extract_full)
        # Test the extraction result, not the module object (which is always
        # truthy). Assumes extractors return an empty/falsy value when
        # nothing matches -- TODO confirm for every extractor.
        if found:
            return (id_type, found)
    return None

@ -189,7 +189,7 @@ def is_valid(arxiv_id):
def get_bibtex(arxiv_id):
"""
Get a BibTeX entry for a given DOI.
Get a BibTeX entry for a given arXiv ID.
.. note::

@ -1 +1,33 @@
# TODO
"""
This file contains all the HAL-related functions.
TODO:
* Add functions to homogenize the interface with the arXiv one.
"""
import re
from libbmc import tools
# Pattern for a versioned HAL identifier such as "hal-01234567, version 2".
# Group 1 captures the id itself ("hal-" followed by exactly 8 digits) and
# group 2 captures the version number.
REGEX = re.compile(r"(hal-\d{8}), version (\d+)")
def is_valid(hal_id):
    """
    Tell whether a string is, in its entirety, a well-formed HAL id.

    :param hal_id: The HAL id to be checked.
    :returns: ``True`` if ``REGEX`` matches from the start of ``hal_id`` and \
            the match spans the whole string, ``False`` otherwise.
    """
    found = REGEX.match(hal_id)
    if found is None:
        return False
    # A prefix-only match (trailing characters left over) is not valid.
    return found.group(0) == hal_id
def extract_from_text(text):
    """
    Extract HAL ids from a text.

    :param text: The text to extract HAL ids from.
    :returns: The matches of ``REGEX`` found in ``text``, passed through \
            ``tools.remove_duplicates``.
    """
    matches = REGEX.findall(text)
    return tools.remove_duplicates(matches)