From 3eb251a8d4e67022ffacf5fbdfcee09d80be5a1f Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Wed, 20 Jan 2016 23:40:07 +0100 Subject: [PATCH] Fix citation fetching from arXiv papers --- libbmc/citations/bbl.py | 2 ++ libbmc/citations/bibtex.py | 2 ++ libbmc/citations/pdf.py | 2 ++ libbmc/citations/plaintext.py | 14 ++++++++++---- libbmc/papers/identifiers.py | 2 ++ libbmc/papers/tearpages.py | 2 ++ libbmc/repositories/arxiv.py | 3 ++- libbmc/tools.py | 2 +- 8 files changed, 23 insertions(+), 6 deletions(-) diff --git a/libbmc/citations/bbl.py b/libbmc/citations/bbl.py index 3fde99e..66c2ada 100644 --- a/libbmc/citations/bbl.py +++ b/libbmc/citations/bbl.py @@ -1,6 +1,8 @@ """ This files contains all the functions to extract DOIs of citations from .bbl files. + +# TODO: Unittests """ import os import re diff --git a/libbmc/citations/bibtex.py b/libbmc/citations/bibtex.py index 4c357d8..4441b84 100644 --- a/libbmc/citations/bibtex.py +++ b/libbmc/citations/bibtex.py @@ -1,6 +1,8 @@ """ This files contains all the functions to extract DOIs of citations from BibTeX files. + +# TODO: Unittests """ import bibtexparser import os diff --git a/libbmc/citations/pdf.py b/libbmc/citations/pdf.py index 90adbe9..df7ae2b 100644 --- a/libbmc/citations/pdf.py +++ b/libbmc/citations/pdf.py @@ -1,6 +1,8 @@ """ This files contains all the functions to extract DOIs of citations from PDF files. + +# TODO: Unittests """ import requests import subprocess diff --git a/libbmc/citations/plaintext.py b/libbmc/citations/plaintext.py index f5a36cb..8f1ad32 100644 --- a/libbmc/citations/plaintext.py +++ b/libbmc/citations/plaintext.py @@ -1,6 +1,8 @@ """ This files contains all the functions to extract DOIs of citations from plaintext files. + +# TODO: Unittests """ import os import requests @@ -57,6 +59,9 @@ def get_cited_DOIs(file): # It is either a path to a plaintext file or the content of a plaintext # file, we need some pre-processing to get a list of citations. plaintext_citations = get_plaintext_citations(file) + else: + # Else, we passed a list of plaintext citations. + plaintext_citations = file dois = {} crossref_queue = [] @@ -64,15 +69,15 @@ def get_cited_DOIs(file): for citation in plaintext_citations[:]: # Some citations already contain a DOI so try to match it directly matched_DOIs = doi.extract_from_text(citation) - if matched_DOIs is not None: + if len(matched_DOIs) > 0: # Add the DOI and go on - dois[citation] = matched_DOIs[0] + dois[citation] = next(iter(matched_DOIs)) continue # Same thing for arXiv id matched_arXiv = arxiv.extract_from_text(citation) - if matched_arXiv is not None: + if len(matched_arXiv) > 0: # Add the associated DOI and go on - dois[citation] = arxiv.to_DOI(matched_arXiv[0]) + dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv))) continue # If no match found, stack it for next step # Note to remove URLs in the citation as the plaintext citations can @@ -81,6 +86,7 @@ def get_cited_DOIs(file): # Do batch with remaining papers, to prevent from the timeout of CrossRef for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE): + batch = [i for i in batch] try: # Fetch results from CrossRef r = requests.post(CROSSREF_LINKS_API_URL, json=batch) diff --git a/libbmc/papers/identifiers.py b/libbmc/papers/identifiers.py index f3083fd..32e340f 100644 --- a/libbmc/papers/identifiers.py +++ b/libbmc/papers/identifiers.py @@ -3,6 +3,8 @@ This file contains various functions to fetch unique identifiers from papers (DOIs, arXiv id etc). Needs pdftotext and/or djvutxt installed on the machine. + +TODO: Unittests """ import subprocess diff --git a/libbmc/papers/tearpages.py b/libbmc/papers/tearpages.py index 9fdfcd6..3cd2f24 100644 --- a/libbmc/papers/tearpages.py +++ b/libbmc/papers/tearpages.py @@ -1,6 +1,8 @@ """ This file contains the necessary functions to determine whether we should tear the first page from a PDF file, and actually tear it. + +TODO: Unittests """ import tearpages diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py index 7638343..1b0b97c 100644 --- a/libbmc/repositories/arxiv.py +++ b/libbmc/repositories/arxiv.py @@ -374,6 +374,7 @@ def from_DOI(doi): :returns: The arXiv eprint id, or ``None`` if not found. >>> from_DOI('10.1209/0295-5075/111/40005') + # Note: Test do not pass due to an arXiv API bug. '1506.06690' """ try: @@ -490,7 +491,7 @@ def get_citations(arxiv_id): a canonical form. :returns: A dict of cleaned plaintext citations and their associated DOI. - >>> get_citations("1506.06690") + >>> get_citations("1401.2910") # TODO: Unittests """ dois = {} diff --git a/libbmc/tools.py b/libbmc/tools.py index aa6d0be..3e6e9ed 100644 --- a/libbmc/tools.py +++ b/libbmc/tools.py @@ -87,7 +87,7 @@ def batch(iterable, size): it = iter(iterable) while True: bi = islice(it, size) - yield chain([bi.next()], bi) + yield chain([next(bi)], bi) def remove_URLs(text):