Fix citation fetching from arXiv papers

Lucas Verney 2016-01-20 23:40:07 +01:00
parent 975fd0f38f
commit 3eb251a8d4
8 changed files with 23 additions and 6 deletions

View File

@@ -1,6 +1,8 @@
 """
 This file contains all the functions to extract DOIs of citations from .bbl
 files.
+
+# TODO: Unittests
 """
 import os
 import re

View File

@@ -1,6 +1,8 @@
 """
 This file contains all the functions to extract DOIs of citations from
 BibTeX files.
+
+# TODO: Unittests
 """
 import bibtexparser
 import os

View File

@@ -1,6 +1,8 @@
 """
 This file contains all the functions to extract DOIs of citations from
 PDF files.
+
+# TODO: Unittests
 """
 import requests
 import subprocess

View File

@@ -1,6 +1,8 @@
 """
 This file contains all the functions to extract DOIs of citations from
 plaintext files.
+
+# TODO: Unittests
 """
 import os
 import requests
@@ -57,6 +59,9 @@ def get_cited_DOIs(file):
         # It is either a path to a plaintext file or the content of a plaintext
         # file, we need some pre-processing to get a list of citations.
         plaintext_citations = get_plaintext_citations(file)
+    else:
+        # Else, we passed a list of plaintext citations.
+        plaintext_citations = file
 
     dois = {}
     crossref_queue = []
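
This hunk extends get_cited_DOIs so that, besides a path to a plaintext file or the raw content of one, it also accepts an already-split list of citations. A quick sketch of the three input shapes now handled (the argument values are hypothetical):

    # Hypothetical inputs, mirroring the comments in the hunk above:
    get_cited_DOIs("/tmp/refs.txt")         # path to a plaintext file
    get_cited_DOIs("[1] A. Author, ...")    # raw content of a plaintext file
    get_cited_DOIs(["A. Author, ..."])      # pre-split list of citations (new)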
@@ -64,15 +69,15 @@ def get_cited_DOIs(file):
     for citation in plaintext_citations[:]:
         # Some citations already contain a DOI so try to match it directly
         matched_DOIs = doi.extract_from_text(citation)
-        if matched_DOIs is not None:
+        if len(matched_DOIs) > 0:
             # Add the DOI and go on
-            dois[citation] = matched_DOIs[0]
+            dois[citation] = next(iter(matched_DOIs))
             continue
         # Same thing for arXiv id
         matched_arXiv = arxiv.extract_from_text(citation)
-        if matched_arXiv is not None:
+        if len(matched_arXiv) > 0:
             # Add the associated DOI and go on
-            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
+            dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
             continue
         # If no match found, stack it for next step
         # Note to remove URLs in the citation as the plaintext citations can
@@ -81,6 +86,7 @@ def get_cited_DOIs(file):
 
     # Do batch with remaining papers, to prevent from the timeout of CrossRef
     for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
+        batch = [i for i in batch]
         try:
             # Fetch results from CrossRef
             r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
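
Taken together, the changes in this file suggest that doi.extract_from_text and arxiv.extract_from_text now return a possibly empty collection rather than None or a list, presumably a set, which would explain next(iter(...)) replacing the [0] indexing. A minimal sketch of that assumed contract, with a hypothetical match result:

    # Assumed new contract of extract_from_text(): always a collection,
    # empty when nothing matched, unordered, hence not indexable.
    matched_DOIs = {"10.1209/0295-5075/111/40005"}  # hypothetical result

    if len(matched_DOIs) > 0:
        # matched_DOIs[0] would raise TypeError on a set;
        # next(iter(matched_DOIs)) returns an arbitrary element instead.
        first_doi = next(iter(matched_DOIs))

The new `batch = [i for i in batch]` line likely materializes the lazy batch iterator so that requests.post(..., json=batch) can serialize it; see the batch generator at the end of this diff.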

View File

@@ -3,6 +3,8 @@ This file contains various functions to fetch unique identifiers from papers
 (DOIs, arXiv id etc).
 
 Needs pdftotext and/or djvutxt installed on the machine.
+
+TODO: Unittests
 """
 import subprocess

View File

@@ -1,6 +1,8 @@
 """
 This file contains the necessary functions to determine whether we should tear
 the first page from a PDF file, and actually tear it.
+
+TODO: Unittests
 """
 import tearpages

View File

@@ -374,6 +374,7 @@ def from_DOI(doi):
     :returns: The arXiv eprint id, or ``None`` if not found.
 
     >>> from_DOI('10.1209/0295-5075/111/40005')
+    # Note: Test does not pass due to an arXiv API bug.
     '1506.06690'
     """
     try:
@@ -490,7 +491,7 @@ def get_citations(arxiv_id):
     a canonical form.
     :returns: A dict of cleaned plaintext citations and their associated DOI.
 
-    >>> get_citations("1506.06690")
+    >>> get_citations("1401.2910")
     # TODO: Unittests
     """
     dois = {}
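
Per the commit title, the doctest now exercises a different eprint, presumably to sidestep the arXiv API bug noted in from_DOI above. Going by the doctests alone, the two touched helpers would be used roughly like this (a sketch only; the import is assumed and the return values are illustrative):

    # assumes the repository's arxiv module is importable as `arxiv`
    # (its actual path is not shown in this diff)
    eprint = arxiv.from_DOI('10.1209/0295-5075/111/40005')
    # expected: '1506.06690' (currently failing, per the API-bug note above)

    citations = arxiv.get_citations('1401.2910')
    # a dict mapping cleaned plaintext citations to their associated DOIs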

View File

@@ -87,7 +87,7 @@ def batch(iterable, size):
     it = iter(iterable)
     while True:
         bi = islice(it, size)
-        yield chain([bi.next()], bi)
+        yield chain([next(bi)], bi)
 
 
 def remove_URLs(text):
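
This last hunk is a straightforward Python 3 port: iterators no longer have a .next() method, so the built-in next() must be used. The generator pulls one element eagerly, then chains it back in front of the rest of the islice, ending the loop once the source is exhausted. A self-contained sketch of the same pattern (with a guard the original does not need on the Python 3 versions of its era, but that PEP 479 makes mandatory on Python 3.7+):

    from itertools import chain, islice

    def batch(iterable, size):
        """Yield successive batches of `size` items from `iterable`."""
        it = iter(iterable)
        while True:
            bi = islice(it, size)
            try:
                first = next(bi)  # raises StopIteration when exhausted
            except StopIteration:
                return  # PEP 479: StopIteration must not escape a generator
            yield chain([first], bi)

    for b in batch(range(7), 3):
        print(list(b))  # [0, 1, 2], then [3, 4, 5], then [6]

Note that each yielded batch is a lazy chain iterator, not a list, which is why get_cited_DOIs materializes it with `batch = [i for i in batch]` before passing it to requests.post(..., json=batch).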