Add identifiers fetching from papers
This commit is contained in:
parent
f688534e33
commit
ba564be738
64
libbmc/papers/identifiers.py
Normal file
64
libbmc/papers/identifiers.py
Normal file
@ -0,0 +1,64 @@
|
||||
"""
|
||||
This file contains various functions to fetch unique identifiers from papers
|
||||
(DOIs, arXiv id etc).
|
||||
|
||||
Needs pdftotext and/or djvutxt installed on the machine.
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
from libbmc import doi, isbn
|
||||
from libbmc.repositories import arxiv, hal
|
||||
|
||||
|
||||
def find_identifiers(src):
    """
    Search for a valid identifier (DOI, ISBN, arXiv, HAL) in a given file.

    The file is converted to text with ``pdftotext`` (``.pdf``) or
    ``djvutxt`` (``.djvu``), then each identifier type is searched for in
    turn (ISBN, DOI, arXiv, HAL). The first type yielding a match wins.

    .. note::

        This function returns the first matching identifier, that is the
        most likely to be relevant for this file. However, it may fail and
        return an identifier taken from the references or another paper.

    :param src: Path to the file to scan. Only paths ending in ``.pdf`` or
        ``.djvu`` are handled.

    :returns: a tuple ``(type, identifier)`` where ``type`` is one of
        ``"isbn"``, ``"doi"``, ``"arxiv"`` or ``"hal"`` and ``identifier``
        is the value returned by the matching module's
        ``extract_from_text``; or ``None`` if nothing was found, the file
        type is not supported, or the converter could not be run.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return None

    try:
        totext = subprocess.Popen(command,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        # communicate() drains both pipes and waits for the process to exit,
        # so the output is never lost to a race with poll() and the child
        # cannot block on a full stderr pipe.
        stdout, _ = totext.communicate()
    except OSError:
        # The converter is not installed or cannot be executed.
        return None

    # Flatten the output onto a single line so that identifiers are matched
    # regardless of the line breaks introduced by the converter.
    extract_full = ' '.join([line.decode("utf-8", errors="replace").strip()
                             for line in stdout.splitlines()])

    # Order matters: the first identifier type with a match is returned.
    extractors = (
        ("isbn", isbn),
        ("doi", doi),
        ("arxiv", arxiv),
        ("hal", hal),
    )
    for id_type, module in extractors:
        found = module.extract_from_text(extract_full)
        # Test the extraction result, not the module (which is always truthy).
        if found:
            return (id_type, found)

    return None
|
@ -189,7 +189,7 @@ def is_valid(arxiv_id):
|
||||
|
||||
def get_bibtex(arxiv_id):
|
||||
"""
|
||||
Get a BibTeX entry for a given DOI.
|
||||
Get a BibTeX entry for a given arXiv ID.
|
||||
|
||||
.. note::
|
||||
|
||||
|
@ -1 +1,33 @@
|
||||
# TODO
|
||||
"""
|
||||
This file contains all the HAL-related functions.
|
||||
|
||||
TODO:
|
||||
* Add functions to make the interface consistent with the arXiv one.
|
||||
"""
|
||||
import re
|
||||
|
||||
from libbmc import tools
|
||||
|
||||
|
||||
REGEX = re.compile(r"(hal-\d{8}), version (\d+)")
|
||||
|
||||
|
||||
def is_valid(hal_id):
    """
    Tell whether a string is, in its entirety, a well-formed HAL id.

    :param hal_id: The HAL id to be checked.
    :returns: ``True`` if ``REGEX`` matches at the start of ``hal_id`` and \
            the matched text is the whole string, ``False`` otherwise.
    """
    found = REGEX.match(hal_id)
    if found is None:
        return False
    # Reject strings carrying trailing characters after the matched id.
    return found.group(0) == hal_id
|
||||
|
||||
|
||||
def extract_from_text(text):
    """
    Extract HAL ids from a text.

    :param text: The text to extract HAL ids from.
    :returns: The matches of ``REGEX`` found in ``text``, passed through \
            ``tools.remove_duplicates``. As ``REGEX`` has two capture \
            groups, each match is an ``(id, version)`` tuple.
    """
    matches = REGEX.findall(text)
    return tools.remove_duplicates(matches)
|
||||
|
Loading…
Reference in New Issue
Block a user