From ba564be738bf594fea8621eebffc75923c638953 Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Sun, 10 Jan 2016 15:12:06 +0100
Subject: [PATCH] Add identifiers fetching from papers

---
 libbmc/papers/identifiers.py | 56 ++++++++++++++++++++++++++++++++++++
 libbmc/repositories/arxiv.py |  2 +-
 libbmc/repositories/hal.py   | 34 +++++++++++++++++++++-
 3 files changed, 90 insertions(+), 2 deletions(-)
 create mode 100644 libbmc/papers/identifiers.py

diff --git a/libbmc/papers/identifiers.py b/libbmc/papers/identifiers.py
new file mode 100644
index 0000000..a0c408d
--- /dev/null
+++ b/libbmc/papers/identifiers.py
@@ -0,0 +1,56 @@
+"""
+This file contains various functions to fetch unique identifiers from papers
+(DOIs, arXiv id etc).
+
+Needs pdftotext and/or djvutxt installed on the machine.
+"""
+import subprocess
+
+from libbmc import doi, isbn
+from libbmc.repositories import arxiv, hal
+
+
+def find_identifiers(src):
+    """
+    Search for a valid identifier (DOI, ISBN, arXiv, HAL) in a given file.
+
+    The file is converted to text with ``pdftotext`` (``.pdf`` files) or
+    ``djvutxt`` (``.djvu`` files), then the text is scanned for ISBN, DOI,
+    arXiv and HAL identifiers, in that order.
+
+    .. note::
+
+        This function returns the first matching identifier, that is the most
+        likely to be relevant for this file. However, it may fail and return an
+        identifier taken from the references or another paper.
+
+    :param src: Path to the file to scan.
+
+    :returns: a tuple ``(type, identifier)`` where ``identifier`` is the
+        value returned by the matching ``extract_from_text`` function, or
+        ``None`` if the file type is not supported or nothing was found.
+    """
+    if src.endswith(".pdf"):
+        command = ["pdftotext", src, "-"]
+    elif src.endswith(".djvu"):
+        command = ["djvutxt", src]
+    else:
+        return None
+
+    totext = subprocess.Popen(command,
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
+    # communicate() drains stdout and stderr together and waits for the tool
+    # to exit, so a full stderr pipe cannot stall it and no output is missed.
+    stdout, _ = totext.communicate()
+    extract_full = ' '.join([i.decode("utf-8").strip()
+                             for i in stdout.splitlines()])
+
+    # Test what was extracted, not the imported modules (always truthy).
+    for id_type, module in (("isbn", isbn), ("doi", doi),
+                            ("arxiv", arxiv), ("hal", hal)):
+        found = module.extract_from_text(extract_full)
+        if found:
+            return (id_type, found)
+
+    return None
diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py
index 7862be5..acf6a6f 100644
--- a/libbmc/repositories/arxiv.py
+++ b/libbmc/repositories/arxiv.py
@@ -189,7 +189,7 @@ def is_valid(arxiv_id):
 
 def get_bibtex(arxiv_id):
     """
-    Get a BibTeX entry for a given DOI.
+    Get a BibTeX entry for a given arXiv ID.
 
     .. note::
 
diff --git a/libbmc/repositories/hal.py b/libbmc/repositories/hal.py
index 4640904..5e36165 100644
--- a/libbmc/repositories/hal.py
+++ b/libbmc/repositories/hal.py
@@ -1 +1,33 @@
-# TODO
+"""
+This file contains all the HAL-related functions.
+
+TODO:
+    * Add functions to homogenize interface with arXiv one.
+"""
+import re
+
+from libbmc import tools
+
+
+REGEX = re.compile(r"(hal-\d{8}), version (\d+)")
+
+
+def is_valid(hal_id):
+    """
+    Check that a given HAL id is a valid one.
+
+    :param hal_id: The HAL id to be checked (``hal-XXXXXXXX, version N``).
+    :returns: Boolean indicating whether the HAL id is valid or not.
+    """
+    match = REGEX.match(hal_id)
+    return ((match is not None) and (match.group(0) == hal_id))
+
+
+def extract_from_text(text):
+    """
+    Extract HAL ids from a text.
+
+    :param text: The text to extract HAL ids from.
+    :returns: A list of deduplicated ``(HAL id, version)`` tuples.
+    """
+    return tools.remove_duplicates(REGEX.findall(text))