Add identifiers fetching from papers
This commit is contained in:
parent
f688534e33
commit
ba564be738
64
libbmc/papers/identifiers.py
Normal file
64
libbmc/papers/identifiers.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
"""
|
||||||
|
This file contains various functions to fetch unique identifiers from papers
|
||||||
|
(DOIs, arXiv id etc).
|
||||||
|
|
||||||
|
Needs pdftotext and/or djvutxt installed on the machine.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
from libbmc import doi, isbn
|
||||||
|
from libbmc.repositories import arxiv, hal
|
||||||
|
|
||||||
|
|
||||||
|
def find_identifiers(src):
    """
    Search for a valid identifier (DOI, ISBN, arXiv, HAL) in a given file.

    The file is converted to text with ``pdftotext`` (``.pdf`` files) or
    ``djvutxt`` (``.djvu`` files). The extracted text is then scanned for
    ISBN, DOI, arXiv and HAL identifiers, in that order, and the first kind
    of identifier that yields a non-empty result wins.

    .. note ::

        This function returns the first matching identifier, that is the most
        likely to be relevant for this file. However, it may fail and return
        an identifier taken from the references or another paper.

    :param src: Path to the file to scan. Only paths ending in ``.pdf`` or
        ``.djvu`` are handled.

    :returns: a tuple ``(type, identifier)`` or ``None`` if not found or an
        error occurred. ``type`` is one of ``"isbn"``, ``"doi"``,
        ``"arxiv"`` or ``"hal"``; ``identifier`` is the value returned by
        the ``extract_from_text`` function of the matching module, passed
        through unchanged.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        # Unsupported file type.
        return None

    try:
        totext = subprocess.Popen(command,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    except OSError:
        # The conversion tool is not installed or could not be started.
        return None

    # communicate() reads both pipes until EOF, waits for the process to exit
    # and closes the pipes. This avoids polling races, a deadlock on a full
    # stderr pipe, and leaked file descriptors / zombie processes.
    raw_output, _ = totext.communicate()

    # Undecodable bytes are replaced rather than aborting the whole scan.
    lines = raw_output.decode("utf-8", "replace").split("\n")
    extract_full = ' '.join(line.strip() for line in lines).strip()
    if not extract_full:
        # Conversion failed or the document has no text layer.
        return None

    # Order matters: the first identifier type with a match is returned.
    extractors = (
        ("isbn", isbn),
        ("doi", doi),
        ("arxiv", arxiv),
        ("hal", hal),
    )
    for id_type, module in extractors:
        found = module.extract_from_text(extract_full)
        # Test the extraction result, not the module object (always truthy).
        if found:
            return (id_type, found)

    return None
|
@ -189,7 +189,7 @@ def is_valid(arxiv_id):
|
|||||||
|
|
||||||
def get_bibtex(arxiv_id):
|
def get_bibtex(arxiv_id):
|
||||||
"""
|
"""
|
||||||
Get a BibTeX entry for a given DOI.
|
Get a BibTeX entry for a given arXiv ID.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
|
@ -1 +1,33 @@
|
|||||||
# TODO
|
"""
|
||||||
|
This file contains all the HAL-related functions.
|
||||||
|
|
||||||
|
TODO:
|
||||||
|
* Add functions to homogenize the interface with the arXiv one.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
from libbmc import tools
|
||||||
|
|
||||||
|
|
||||||
|
# Pattern for a HAL identifier followed by its version, such as
# "hal-00123456, version 2". Group 1 is the identifier, group 2 the version.
REGEX = re.compile(r"(hal-\d{8}), version (\d+)")


def is_valid(hal_id):
    """
    Tell whether a string is exactly a HAL id as described by ``REGEX``.

    The whole string has to be consumed by the pattern: any leading or
    trailing extra character makes the id invalid.

    :param hal_id: The HAL id to be checked.
    :returns: ``True`` if the HAL id is valid, ``False`` otherwise.
    """
    found = REGEX.match(hal_id)
    if found is None:
        return False
    # match() is anchored at the start only, so also require that the match
    # reaches the end of the string.
    return found.end() == len(hal_id)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_from_text(text):
    """
    Find every HAL id occurring in a text.

    :param text: The text to extract HAL ids from.
    :returns: The matches of ``REGEX`` in ``text``, passed through \
            ``tools.remove_duplicates``. As ``REGEX`` has two capturing \
            groups, each match is an ``(id, version)`` tuple of strings.
    """
    matches = REGEX.findall(text)
    # NOTE(review): presumably remove_duplicates keeps one copy of each match
    # -- confirm against libbmc.tools.
    return tools.remove_duplicates(matches)
|
||||||
|
Loading…
Reference in New Issue
Block a user