From 3eb251a8d4e67022ffacf5fbdfcee09d80be5a1f Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)" <phyks@phyks.me>
Date: Wed, 20 Jan 2016 23:40:07 +0100
Subject: [PATCH] Fix citation fetching from arXiv papers

---
 libbmc/citations/bbl.py       |  2 ++
 libbmc/citations/bibtex.py    |  2 ++
 libbmc/citations/pdf.py       |  2 ++
 libbmc/citations/plaintext.py | 14 ++++++++++----
 libbmc/papers/identifiers.py  |  2 ++
 libbmc/papers/tearpages.py    |  2 ++
 libbmc/repositories/arxiv.py  |  3 ++-
 libbmc/tools.py               |  2 +-
 8 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/libbmc/citations/bbl.py b/libbmc/citations/bbl.py
index 3fde99e..66c2ada 100644
--- a/libbmc/citations/bbl.py
+++ b/libbmc/citations/bbl.py
@@ -1,6 +1,8 @@
 """
 This files contains all the functions to extract DOIs of citations from .bbl
 files.
+
+TODO: Unittests
 """
 import os
 import re
diff --git a/libbmc/citations/bibtex.py b/libbmc/citations/bibtex.py
index 4c357d8..4441b84 100644
--- a/libbmc/citations/bibtex.py
+++ b/libbmc/citations/bibtex.py
@@ -1,6 +1,8 @@
 """
 This files contains all the functions to extract DOIs of citations from
 BibTeX files.
+
+TODO: Unittests
 """
 import bibtexparser
 import os
diff --git a/libbmc/citations/pdf.py b/libbmc/citations/pdf.py
index 90adbe9..df7ae2b 100644
--- a/libbmc/citations/pdf.py
+++ b/libbmc/citations/pdf.py
@@ -1,6 +1,8 @@
 """
 This files contains all the functions to extract DOIs of citations from
 PDF files.
+
+TODO: Unittests
 """
 import requests
 import subprocess
diff --git a/libbmc/citations/plaintext.py b/libbmc/citations/plaintext.py
index f5a36cb..8f1ad32 100644
--- a/libbmc/citations/plaintext.py
+++ b/libbmc/citations/plaintext.py
@@ -1,6 +1,8 @@
 """
 This files contains all the functions to extract DOIs of citations from
 plaintext files.
+
+TODO: Unittests
 """
 import os
 import requests
@@ -57,6 +59,9 @@ def get_cited_DOIs(file):
         # It is either a path to a plaintext file or the content of a plaintext
         # file, we need some pre-processing to get a list of citations.
         plaintext_citations = get_plaintext_citations(file)
+    else:
+        # Otherwise, `file` is already a list of plaintext citations.
+        plaintext_citations = file
     dois = {}
     crossref_queue = []
 
@@ -64,15 +69,15 @@ def get_cited_DOIs(file):
     for citation in plaintext_citations[:]:
         # Some citations already contain a DOI so try to match it directly
         matched_DOIs = doi.extract_from_text(citation)
-        if matched_DOIs is not None:
+        if len(matched_DOIs) > 0:
             # Add the DOI and go on
-            dois[citation] = matched_DOIs[0]
+            dois[citation] = next(iter(matched_DOIs))
             continue
         # Same thing for arXiv id
         matched_arXiv = arxiv.extract_from_text(citation)
-        if matched_arXiv is not None:
+        if len(matched_arXiv) > 0:
             # Add the associated DOI and go on
-            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
+            dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
             continue
         # If no match found, stack it for next step
         # Note to remove URLs in the citation as the plaintext citations can
@@ -81,6 +86,7 @@ def get_cited_DOIs(file):
 
     # Do batch with remaining papers, to prevent from the timeout of CrossRef
     for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
+        batch = [i for i in batch]
         try:
             # Fetch results from CrossRef
             r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
diff --git a/libbmc/papers/identifiers.py b/libbmc/papers/identifiers.py
index f3083fd..32e340f 100644
--- a/libbmc/papers/identifiers.py
+++ b/libbmc/papers/identifiers.py
@@ -3,6 +3,8 @@ This file contains various functions to fetch unique identifiers from papers
 (DOIs, arXiv id etc).
 
 Needs pdftotext and/or djvutxt installed on the machine.
+
+TODO: Unittests
 """
 import subprocess
 
diff --git a/libbmc/papers/tearpages.py b/libbmc/papers/tearpages.py
index 9fdfcd6..3cd2f24 100644
--- a/libbmc/papers/tearpages.py
+++ b/libbmc/papers/tearpages.py
@@ -1,6 +1,8 @@
 """
 This file contains the necessary functions to determine whether we should tear
 the first page from a PDF file, and actually tear it.
+
+TODO: Unittests
 """
 import tearpages
 
diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py
index 7638343..1b0b97c 100644
--- a/libbmc/repositories/arxiv.py
+++ b/libbmc/repositories/arxiv.py
@@ -374,6 +374,7 @@ def from_DOI(doi):
     :returns: The arXiv eprint id, or ``None`` if not found.
 
     >>> from_DOI('10.1209/0295-5075/111/40005')
+    # Note: This test does not pass due to an arXiv API bug.
     '1506.06690'
     """
     try:
@@ -490,7 +491,7 @@ def get_citations(arxiv_id):
             a canonical form.
     :returns: A dict of cleaned plaintext citations and their associated DOI.
 
-    >>> get_citations("1506.06690")
+    >>> get_citations("1401.2910")
     # TODO: Unittests
     """
     dois = {}
diff --git a/libbmc/tools.py b/libbmc/tools.py
index aa6d0be..3e6e9ed 100644
--- a/libbmc/tools.py
+++ b/libbmc/tools.py
@@ -87,7 +87,7 @@ def batch(iterable, size):
     it = iter(iterable)
     while True:
         bi = islice(it, size)
-        yield chain([bi.next()], bi)
+        yield chain([next(bi)], bi)
 
 
 def remove_URLs(text):