Fix citation fetching from arXiv papers

This commit is contained in:
Lucas Verney 2016-01-20 23:40:07 +01:00
parent 975fd0f38f
commit 3eb251a8d4
8 changed files with 23 additions and 6 deletions

View File

@ -1,6 +1,8 @@
""" """
This file contains all the functions to extract DOIs of citations from .bbl This file contains all the functions to extract DOIs of citations from .bbl
files. files.
# TODO: Unittests
""" """
import os import os
import re import re

View File

@ -1,6 +1,8 @@
""" """
This file contains all the functions to extract DOIs of citations from This file contains all the functions to extract DOIs of citations from
BibTeX files. BibTeX files.
# TODO: Unittests
""" """
import bibtexparser import bibtexparser
import os import os

View File

@ -1,6 +1,8 @@
""" """
This file contains all the functions to extract DOIs of citations from This file contains all the functions to extract DOIs of citations from
PDF files. PDF files.
# TODO: Unittests
""" """
import requests import requests
import subprocess import subprocess

View File

@ -1,6 +1,8 @@
""" """
This file contains all the functions to extract DOIs of citations from This file contains all the functions to extract DOIs of citations from
plaintext files. plaintext files.
# TODO: Unittests
""" """
import os import os
import requests import requests
@ -57,6 +59,9 @@ def get_cited_DOIs(file):
# It is either a path to a plaintext file or the content of a plaintext # It is either a path to a plaintext file or the content of a plaintext
# file, we need some pre-processing to get a list of citations. # file, we need some pre-processing to get a list of citations.
plaintext_citations = get_plaintext_citations(file) plaintext_citations = get_plaintext_citations(file)
else:
# Else, we passed a list of plaintext citations.
plaintext_citations = file
dois = {} dois = {}
crossref_queue = [] crossref_queue = []
@ -64,15 +69,15 @@ def get_cited_DOIs(file):
for citation in plaintext_citations[:]: for citation in plaintext_citations[:]:
# Some citations already contain a DOI so try to match it directly # Some citations already contain a DOI so try to match it directly
matched_DOIs = doi.extract_from_text(citation) matched_DOIs = doi.extract_from_text(citation)
if matched_DOIs is not None: if len(matched_DOIs) > 0:
# Add the DOI and go on # Add the DOI and go on
dois[citation] = matched_DOIs[0] dois[citation] = next(iter(matched_DOIs))
continue continue
# Same thing for arXiv id # Same thing for arXiv id
matched_arXiv = arxiv.extract_from_text(citation) matched_arXiv = arxiv.extract_from_text(citation)
if matched_arXiv is not None: if len(matched_arXiv) > 0:
# Add the associated DOI and go on # Add the associated DOI and go on
dois[citation] = arxiv.to_DOI(matched_arXiv[0]) dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
continue continue
# If no match found, stack it for next step # If no match found, stack it for next step
# Note to remove URLs in the citation as the plaintext citations can # Note to remove URLs in the citation as the plaintext citations can
@ -81,6 +86,7 @@ def get_cited_DOIs(file):
# Do batch with remaining papers, to prevent from the timeout of CrossRef # Do batch with remaining papers, to prevent from the timeout of CrossRef
for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE): for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
batch = [i for i in batch]
try: try:
# Fetch results from CrossRef # Fetch results from CrossRef
r = requests.post(CROSSREF_LINKS_API_URL, json=batch) r = requests.post(CROSSREF_LINKS_API_URL, json=batch)

View File

@ -3,6 +3,8 @@ This file contains various functions to fetch unique identifiers from papers
(DOIs, arXiv id etc). (DOIs, arXiv id etc).
Needs pdftotext and/or djvutxt installed on the machine. Needs pdftotext and/or djvutxt installed on the machine.
TODO: Unittests
""" """
import subprocess import subprocess

View File

@ -1,6 +1,8 @@
""" """
This file contains the necessary functions to determine whether we should tear This file contains the necessary functions to determine whether we should tear
the first page from a PDF file, and actually tear it. the first page from a PDF file, and actually tear it.
TODO: Unittests
""" """
import tearpages import tearpages

View File

@ -374,6 +374,7 @@ def from_DOI(doi):
:returns: The arXiv eprint id, or ``None`` if not found. :returns: The arXiv eprint id, or ``None`` if not found.
>>> from_DOI('10.1209/0295-5075/111/40005') >>> from_DOI('10.1209/0295-5075/111/40005')
# Note: Tests do not pass due to an arXiv API bug.
'1506.06690' '1506.06690'
""" """
try: try:
@ -490,7 +491,7 @@ def get_citations(arxiv_id):
a canonical form. a canonical form.
:returns: A dict of cleaned plaintext citations and their associated DOI. :returns: A dict of cleaned plaintext citations and their associated DOI.
>>> get_citations("1506.06690") >>> get_citations("1401.2910")
# TODO: Unittests # TODO: Unittests
""" """
dois = {} dois = {}

View File

@ -87,7 +87,7 @@ def batch(iterable, size):
it = iter(iterable) it = iter(iterable)
while True: while True:
bi = islice(it, size) bi = islice(it, size)
yield chain([bi.next()], bi) yield chain([next(bi)], bi)
def remove_URLs(text): def remove_URLs(text):