Fix citation fetching from arXiv papers
This commit is contained in:
parent
975fd0f38f
commit
3eb251a8d4
@ -1,6 +1,8 @@
|
||||
"""
|
||||
This file contains all the functions to extract DOIs of citations from .bbl
|
||||
files.
|
||||
|
||||
# TODO: Unittests
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
|
@ -1,6 +1,8 @@
|
||||
"""
|
||||
This file contains all the functions to extract DOIs of citations from
|
||||
BibTeX files.
|
||||
|
||||
# TODO: Unittests
|
||||
"""
|
||||
import bibtexparser
|
||||
import os
|
||||
|
@ -1,6 +1,8 @@
|
||||
"""
|
||||
This file contains all the functions to extract DOIs of citations from
|
||||
PDF files.
|
||||
|
||||
# TODO: Unittests
|
||||
"""
|
||||
import requests
|
||||
import subprocess
|
||||
|
@ -1,6 +1,8 @@
|
||||
"""
|
||||
This file contains all the functions to extract DOIs of citations from
|
||||
plaintext files.
|
||||
|
||||
# TODO: Unittests
|
||||
"""
|
||||
import os
|
||||
import requests
|
||||
@ -57,6 +59,9 @@ def get_cited_DOIs(file):
|
||||
# It is either a path to a plaintext file or the content of a plaintext
|
||||
# file; we need some pre-processing to get a list of citations.
|
||||
plaintext_citations = get_plaintext_citations(file)
|
||||
else:
|
||||
# Else, we passed a list of plaintext citations.
|
||||
plaintext_citations = file
|
||||
dois = {}
|
||||
crossref_queue = []
|
||||
|
||||
@ -64,15 +69,15 @@ def get_cited_DOIs(file):
|
||||
for citation in plaintext_citations[:]:
|
||||
# Some citations already contain a DOI so try to match it directly
|
||||
matched_DOIs = doi.extract_from_text(citation)
|
||||
if matched_DOIs is not None:
|
||||
if len(matched_DOIs) > 0:
|
||||
# Add the DOI and go on
|
||||
dois[citation] = matched_DOIs[0]
|
||||
dois[citation] = next(iter(matched_DOIs))
|
||||
continue
|
||||
# Same thing for arXiv id
|
||||
matched_arXiv = arxiv.extract_from_text(citation)
|
||||
if matched_arXiv is not None:
|
||||
if len(matched_arXiv) > 0:
|
||||
# Add the associated DOI and go on
|
||||
dois[citation] = arxiv.to_DOI(matched_arXiv[0])
|
||||
dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
|
||||
continue
|
||||
# If no match found, stack it for next step
|
||||
# Note to remove URLs in the citation as the plaintext citations can
|
||||
@ -81,6 +86,7 @@ def get_cited_DOIs(file):
|
||||
|
||||
# Process the remaining papers in batches, to avoid CrossRef timeouts
|
||||
for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
|
||||
batch = [i for i in batch]
|
||||
try:
|
||||
# Fetch results from CrossRef
|
||||
r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
|
||||
|
@ -3,6 +3,8 @@ This file contains various functions to fetch unique identifiers from papers
|
||||
(DOIs, arXiv id etc).
|
||||
|
||||
Needs pdftotext and/or djvutxt installed on the machine.
|
||||
|
||||
TODO: Unittests
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
"""
|
||||
This file contains the necessary functions to determine whether we should tear
|
||||
the first page from a PDF file, and actually tear it.
|
||||
|
||||
TODO: Unittests
|
||||
"""
|
||||
import tearpages
|
||||
|
||||
|
@ -374,6 +374,7 @@ def from_DOI(doi):
|
||||
:returns: The arXiv eprint id, or ``None`` if not found.
|
||||
|
||||
>>> from_DOI('10.1209/0295-5075/111/40005')
|
||||
# Note: This test does not pass due to an arXiv API bug.
|
||||
'1506.06690'
|
||||
"""
|
||||
try:
|
||||
@ -490,7 +491,7 @@ def get_citations(arxiv_id):
|
||||
a canonical form.
|
||||
:returns: A dict of cleaned plaintext citations and their associated DOI.
|
||||
|
||||
>>> get_citations("1506.06690")
|
||||
>>> get_citations("1401.2910")
|
||||
# TODO: Unittests
|
||||
"""
|
||||
dois = {}
|
||||
|
@ -87,7 +87,7 @@ def batch(iterable, size):
|
||||
it = iter(iterable)
|
||||
while True:
|
||||
bi = islice(it, size)
|
||||
yield chain([bi.next()], bi)
|
||||
yield chain([next(bi)], bi)
|
||||
|
||||
|
||||
def remove_URLs(text):
|
||||
|
Loading…
x
Reference in New Issue
Block a user