Add identifiers fetching from papers

This commit is contained in:
Lucas Verney 2016-01-10 15:12:06 +01:00
parent f688534e33
commit ba564be738
3 changed files with 98 additions and 2 deletions

View File

@ -0,0 +1,64 @@
"""
This file contains various functions to fetch unique identifiers from papers
(DOIs, arXiv ids, etc.).
Needs pdftotext and/or djvutxt installed on the machine.
"""
import subprocess
from libbmc import doi, isbn
from libbmc.repositories import arxiv, hal
def find_identifiers(src):
    """
    Search for a valid identifier (DOI, ISBN, arXiv, HAL) in a given file.

    The file is converted to plain text with ``pdftotext`` (for ``.pdf``
    files) or ``djvutxt`` (for ``.djvu`` files). The extracted text is then
    scanned for each identifier type in turn: ISBN, DOI, arXiv, then HAL.

    .. note ::

        This function returns the first identifier type that matches, that
        is the most likely to be relevant for this file. However, it may
        fail and return an identifier taken from the references or another
        paper.

    :param src: Path to the file to scan.
    :returns: a tuple ``(type, identifier)`` where ``type`` is one of
        ``"isbn"``, ``"doi"``, ``"arxiv"`` or ``"hal"`` and ``identifier``
        is the (non-empty) value returned by the matching module's
        ``extract_from_text`` function, or ``None`` if the file type is
        not supported, nothing was found or an error occurred.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return None

    try:
        totext = subprocess.Popen(command,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        # communicate() drains both pipes and waits for the converter to
        # exit. Polling the process instead is racy (a fast converter may
        # exit before the first poll, so its output is never scanned) and
        # leaving stderr unread can block the child once the pipe fills up.
        raw_output, _ = totext.communicate()
    except OSError:
        # The converter is not installed or could not be started.
        return None

    # Flatten the output to a single line; undecodable bytes are replaced
    # rather than aborting the whole scan.
    decoded = raw_output.decode("utf-8", errors="replace")
    extract_full = ' '.join(
        line.strip() for line in decoded.split("\n")).strip()
    if not extract_full:
        # The converter failed or the document holds no text layer.
        return None

    # Order matters: the first identifier type with a match wins.
    extractors = (
        ("isbn", isbn),
        ("doi", doi),
        ("arxiv", arxiv),
        ("hal", hal),
    )
    for id_type, module in extractors:
        found = module.extract_from_text(extract_full)
        # Test the extraction result, not the module object (which is
        # always truthy).
        if found:
            return (id_type, found)
    return None

View File

@ -189,7 +189,7 @@ def is_valid(arxiv_id):
def get_bibtex(arxiv_id):
"""
Get a BibTeX entry for a given DOI.
Get a BibTeX entry for a given arXiv ID.
.. note::

View File

@ -1 +1,33 @@
# TODO
"""
This file contains all the HAL-related functions.
TODO:
* Add functions to homogenize the interface with the arXiv one.
"""
import re
from libbmc import tools
# Matches a HAL reference of the form "hal-<8 digits>, version <N>".
# Group 1 captures the "hal-XXXXXXXX" part and group 2 the version digits.
REGEX = re.compile(r"(hal-\d{8}), version (\d+)")
def is_valid(hal_id):
    """
    Tell whether a string is, in its entirety, a well-formed HAL id.

    The string is valid only if ``REGEX`` matches from its first character
    and the matched text spans the whole string.

    :param hal_id: The HAL id to be checked.
    :returns: ``True`` if the HAL id is valid, ``False`` otherwise.
    """
    found = REGEX.match(hal_id)
    if found is None:
        return False
    # Reject strings that carry extra text after the matched part.
    return found.group(0) == hal_id
def extract_from_text(text):
    """
    Collect the HAL ids that appear in a text.

    :param text: The text to extract HAL ids from.
    :returns: The matches found by ``REGEX`` after being passed through
        ``tools.remove_duplicates``. As ``REGEX`` has two capturing
        groups, each match is a ``(hal id, version)`` tuple of strings.
    """
    # findall() yields one tuple of captured groups per occurrence.
    occurrences = REGEX.findall(text)
    return tools.remove_duplicates(occurrences)