Add a citation fetcher for plaintext, and factorize code with bbl citations fetcher
This commit is contained in:
parent
bd0016cb51
commit
0d17254f6c
@ -1,16 +1,13 @@
|
|||||||
"""
|
"""
|
||||||
This files contains all the functions to deal with .bbl files.
|
This file contains all the functions to extract DOIs of citations from .bbl
|
||||||
|
files.
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import requests
|
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from requests.exception import RequestException
|
|
||||||
|
|
||||||
from libbmc import doi
|
|
||||||
from libbmc import tools
|
from libbmc import tools
|
||||||
from libbmc.repositories import arxiv
|
from libbmc.citations import bbl
|
||||||
|
|
||||||
|
|
||||||
# Regex to match bibitems
|
# Regex to match bibitems
|
||||||
@ -19,11 +16,6 @@ BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
|
|||||||
ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
|
ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
|
||||||
|
|
||||||
|
|
||||||
# CrossRef API URL
|
|
||||||
CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
|
|
||||||
CROSSREF_MAX_BATCH_SIZE = 10
|
|
||||||
|
|
||||||
|
|
||||||
def bibitem_as_plaintext(bibitem):
|
def bibitem_as_plaintext(bibitem):
|
||||||
"""
|
"""
|
||||||
Return a plaintext representation of the bibitem from the ``.bbl`` file.
|
Return a plaintext representation of the bibitem from the ``.bbl`` file.
|
||||||
@ -75,51 +67,16 @@ def get_plaintext_citations(bbl):
|
|||||||
return cleaned_bbl
|
return cleaned_bbl
|
||||||
|
|
||||||
|
|
||||||
def get_cited_DOIs(bbl_input):
    """
    Get the DOIs of the papers cited in this .bbl file.

    The .bbl input is first turned into a list of plaintext citations, and
    that list is then handed to the plaintext citations backend, which does
    the actual DOI lookup.

    :param bbl_input: Either the path to a .bbl file or the content of a
        .bbl file.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    # Imported at function scope so that this module delegates to the
    # plaintext backend rather than calling itself: importing ``bbl`` here
    # and calling ``bbl.get_cited_DOIs`` would recurse into this function.
    from libbmc.citations import plaintext

    # Get the plaintext citations from the bbl file
    plaintext_citations = get_plaintext_citations(bbl_input)
    # Use the plaintext citations parser on these (already parsed) citations
    return plaintext.get_cited_DOIs(plaintext_citations)
|
|
||||||
|
99
libbmc/citations/plaintext.py
Normal file
99
libbmc/citations/plaintext.py
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
"""
|
||||||
|
This file contains all the functions to extract DOIs of citations from
|
||||||
|
plaintext files.
|
||||||
|
"""
|
||||||
|
import os

import requests
from requests.exceptions import RequestException

from libbmc import doi
from libbmc import tools
from libbmc.repositories import arxiv
|
||||||
|
|
||||||
|
|
||||||
|
# CrossRef links API URL (citations are POSTed to it as a JSON list), and the
# maximum number of citations sent to it in a single request.
|
||||||
|
CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
|
||||||
|
CROSSREF_MAX_BATCH_SIZE = 10
|
||||||
|
|
||||||
|
|
||||||
|
def get_plaintext_citations(file):
    """
    Build a clean list of plaintext citations from a plaintext source, which
    is expected to hold one citation per line.

    :param file: Either the path to the plaintext file or the content of a
        plaintext file.
    :returns: A list of cleaned plaintext citations.
    """
    if not os.path.isfile(file):
        # Not an existing path: treat the argument as the content itself
        raw_lines = file.splitlines()
    else:
        with open(file, 'r') as handle:
            raw_lines = handle.readlines()
    # Normalize the whitespaces of every line
    return [tools.clean_whitespaces(raw_line) for raw_line in raw_lines]
|
||||||
|
|
||||||
|
|
||||||
|
def get_cited_DOIs(file):
    """
    Get the DOIs of the papers cited in a plaintext file. The file should
    have one citation per line.

    .. note::

        This function is also used as a backend tool by most of the other
        citations processors, to factorize the code.

    Each citation is first searched for an embedded DOI, then for an arXiv
    id. Citations matching neither are stripped of their URLs and sent to
    the CrossRef links API in batches of at most ``CROSSREF_MAX_BATCH_SIZE``.

    :param file: Either the path to the plaintext file or the content of a
        plaintext file. It can also be a parsed list of plaintext
        citations and, in this case, no preprocessing is done.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
        Citations resolved locally are keyed by the citation itself.
        Citations sent to CrossRef are keyed by the ``"text"`` field of the
        CrossRef result, or by the URL-stripped citation when the batch
        failed. The value is ``None`` when no DOI could be found.
    """
    if isinstance(file, list):
        # Already a pre-processed list of plaintext citations: use it as is.
        plaintext_citations = file
    else:
        # It is either a path to a plaintext file or the content of a
        # plaintext file, we need some pre-processing to get a list of
        # citations.
        plaintext_citations = get_plaintext_citations(file)

    dois = {}
    crossref_queue = []

    # Try to get the DOI directly from the citation
    for citation in plaintext_citations:
        # Some citations already contain a DOI so try to match it directly
        matched_DOIs = doi.extract_from_text(citation)
        if matched_DOIs is not None:
            # Add the DOI and go on
            dois[citation] = matched_DOIs[0]
            continue
        # Same thing for arXiv id
        matched_arXiv = arxiv.extract_from_text(citation)
        if matched_arXiv is not None:
            # Add the associated DOI and go on
            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
            continue
        # If no match found, stack it for next step
        # Note to remove URLs in the citation as the plaintext citations can
        # contain URLs and they are bad for the CrossRef API.
        crossref_queue.append(tools.remove_URLs(citation))

    # Do batch with remaining papers, to prevent from the timeout of CrossRef
    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
        try:
            # Fetch results from CrossRef
            response = requests.post(CROSSREF_LINKS_API_URL, json=batch)
            for result in response.json()["results"]:
                # Store the DOI, or None if CrossRef did not return one. A
                # result without a "text" key still raises KeyError and is
                # handled below, like any other malformed answer.
                dois[result["text"]] = result.get("doi")
        except (RequestException, ValueError, KeyError):
            # If an exception occurred, set all the DOIs to None for the
            # current batch
            for citation in batch:
                dois[citation] = None
    return dois
|
Loading…
Reference in New Issue
Block a user