""" This file contains various functions to fetch unique identifiers from papers (DOIs, arXiv id etc). Needs pdftotext and/or djvutxt installed on the machine. """ import subprocess from libbmc import doi, isbn from libbmc.repositories import arxiv, hal def find_identifiers(src): """ Search for a valid identifier (DOI, ISBN, arXiv, HAL) in a given file. .. note :: This function returns the first matching identifier, that is the most likely to be relevant for this file. However, it may fail and return an identifier taken from the references or another paper. :params src: Path to the file to scan. :returns: a tuple (type, identifier) or ``None`` if not found or \ an error occurred. """ if src.endswith(".pdf"): totext = subprocess.Popen(["pdftotext", src, "-"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) elif src.endswith(".djvu"): totext = subprocess.Popen(["djvutxt", src], stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) else: return None while totext.poll() is None: extract_full = ' '.join([i.decode("utf-8").strip() for i in totext.stdout.readlines()]) found_isbn = isbn.extract_from_text(extract_full) if isbn: totext.terminate() return ("isbn", found_isbn) found_doi = doi.extract_from_text(extract_full) if doi: totext.terminate() return ("doi", found_doi) found_arxiv = arxiv.extract_from_text(extract_full) if arxiv: totext.terminate() return ("arxiv", found_arxiv) found_hal = hal.extract_from_text(extract_full) if hal: totext.terminate() return ("hal", found_hal) return None