Fix citation fetching from arXiv papers
This commit is contained in:
parent
975fd0f38f
commit
3eb251a8d4
@ -1,6 +1,8 @@
|
|||||||
"""
|
"""
|
||||||
This files contains all the functions to extract DOIs of citations from .bbl
|
This files contains all the functions to extract DOIs of citations from .bbl
|
||||||
files.
|
files.
|
||||||
|
|
||||||
|
# TODO: Unittests
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
"""
|
"""
|
||||||
This files contains all the functions to extract DOIs of citations from
|
This files contains all the functions to extract DOIs of citations from
|
||||||
BibTeX files.
|
BibTeX files.
|
||||||
|
|
||||||
|
# TODO: Unittests
|
||||||
"""
|
"""
|
||||||
import bibtexparser
|
import bibtexparser
|
||||||
import os
|
import os
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
"""
|
"""
|
||||||
This files contains all the functions to extract DOIs of citations from
|
This files contains all the functions to extract DOIs of citations from
|
||||||
PDF files.
|
PDF files.
|
||||||
|
|
||||||
|
# TODO: Unittests
|
||||||
"""
|
"""
|
||||||
import requests
|
import requests
|
||||||
import subprocess
|
import subprocess
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
"""
|
"""
|
||||||
This files contains all the functions to extract DOIs of citations from
|
This files contains all the functions to extract DOIs of citations from
|
||||||
plaintext files.
|
plaintext files.
|
||||||
|
|
||||||
|
# TODO: Unittests
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
@ -57,6 +59,9 @@ def get_cited_DOIs(file):
|
|||||||
# It is either a path to a plaintext file or the content of a plaintext
|
# It is either a path to a plaintext file or the content of a plaintext
|
||||||
# file, we need some pre-processing to get a list of citations.
|
# file, we need some pre-processing to get a list of citations.
|
||||||
plaintext_citations = get_plaintext_citations(file)
|
plaintext_citations = get_plaintext_citations(file)
|
||||||
|
else:
|
||||||
|
# Else, we passed a list of plaintext citations.
|
||||||
|
plaintext_citations = file
|
||||||
dois = {}
|
dois = {}
|
||||||
crossref_queue = []
|
crossref_queue = []
|
||||||
|
|
||||||
@ -64,15 +69,15 @@ def get_cited_DOIs(file):
|
|||||||
for citation in plaintext_citations[:]:
|
for citation in plaintext_citations[:]:
|
||||||
# Some citations already contain a DOI so try to match it directly
|
# Some citations already contain a DOI so try to match it directly
|
||||||
matched_DOIs = doi.extract_from_text(citation)
|
matched_DOIs = doi.extract_from_text(citation)
|
||||||
if matched_DOIs is not None:
|
if len(matched_DOIs) > 0:
|
||||||
# Add the DOI and go on
|
# Add the DOI and go on
|
||||||
dois[citation] = matched_DOIs[0]
|
dois[citation] = next(iter(matched_DOIs))
|
||||||
continue
|
continue
|
||||||
# Same thing for arXiv id
|
# Same thing for arXiv id
|
||||||
matched_arXiv = arxiv.extract_from_text(citation)
|
matched_arXiv = arxiv.extract_from_text(citation)
|
||||||
if matched_arXiv is not None:
|
if len(matched_arXiv) > 0:
|
||||||
# Add the associated DOI and go on
|
# Add the associated DOI and go on
|
||||||
dois[citation] = arxiv.to_DOI(matched_arXiv[0])
|
dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
|
||||||
continue
|
continue
|
||||||
# If no match found, stack it for next step
|
# If no match found, stack it for next step
|
||||||
# Note to remove URLs in the citation as the plaintext citations can
|
# Note to remove URLs in the citation as the plaintext citations can
|
||||||
@ -81,6 +86,7 @@ def get_cited_DOIs(file):
|
|||||||
|
|
||||||
# Do batch with remaining papers, to prevent from the timeout of CrossRef
|
# Do batch with remaining papers, to prevent from the timeout of CrossRef
|
||||||
for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
|
for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
|
||||||
|
batch = [i for i in batch]
|
||||||
try:
|
try:
|
||||||
# Fetch results from CrossRef
|
# Fetch results from CrossRef
|
||||||
r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
|
r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
|
||||||
|
@ -3,6 +3,8 @@ This file contains various functions to fetch unique identifiers from papers
|
|||||||
(DOIs, arXiv id etc).
|
(DOIs, arXiv id etc).
|
||||||
|
|
||||||
Needs pdftotext and/or djvutxt installed on the machine.
|
Needs pdftotext and/or djvutxt installed on the machine.
|
||||||
|
|
||||||
|
TODO: Unittests
|
||||||
"""
|
"""
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
"""
|
"""
|
||||||
This file contains the necessary functions to determine whether we should tear
|
This file contains the necessary functions to determine whether we should tear
|
||||||
the first page from a PDF file, and actually tear it.
|
the first page from a PDF file, and actually tear it.
|
||||||
|
|
||||||
|
TODO: Unittests
|
||||||
"""
|
"""
|
||||||
import tearpages
|
import tearpages
|
||||||
|
|
||||||
|
@ -374,6 +374,7 @@ def from_DOI(doi):
|
|||||||
:returns: The arXiv eprint id, or ``None`` if not found.
|
:returns: The arXiv eprint id, or ``None`` if not found.
|
||||||
|
|
||||||
>>> from_DOI('10.1209/0295-5075/111/40005')
|
>>> from_DOI('10.1209/0295-5075/111/40005')
|
||||||
|
# Note: Test do not pass due to an arXiv API bug.
|
||||||
'1506.06690'
|
'1506.06690'
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
@ -490,7 +491,7 @@ def get_citations(arxiv_id):
|
|||||||
a canonical form.
|
a canonical form.
|
||||||
:returns: A dict of cleaned plaintext citations and their associated DOI.
|
:returns: A dict of cleaned plaintext citations and their associated DOI.
|
||||||
|
|
||||||
>>> get_citations("1506.06690")
|
>>> get_citations("1401.2910")
|
||||||
# TODO: Unittests
|
# TODO: Unittests
|
||||||
"""
|
"""
|
||||||
dois = {}
|
dois = {}
|
||||||
|
@ -87,7 +87,7 @@ def batch(iterable, size):
|
|||||||
it = iter(iterable)
|
it = iter(iterable)
|
||||||
while True:
|
while True:
|
||||||
bi = islice(it, size)
|
bi = islice(it, size)
|
||||||
yield chain([bi.next()], bi)
|
yield chain([next(bi)], bi)
|
||||||
|
|
||||||
|
|
||||||
def remove_URLs(text):
|
def remove_URLs(text):
|
||||||
|
Loading…
Reference in New Issue
Block a user