libbmc/libbmc/citations/plaintext.py

106 lines
3.7 KiB
Python

"""
This file contains all the functions to extract DOIs of citations from
plaintext files.
# TODO: Unittests
"""
import os
import requests
from requests.exceptions import RequestException
from libbmc import doi
from libbmc import tools
from libbmc.repositories import arxiv
# CrossRef API URL. Batches of plaintext citations are POSTed to it as JSON
# by get_cited_DOIs.
# NOTE(review): this is a plain-HTTP endpoint; confirm whether an HTTPS
# equivalent should be used instead.
CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
# Maximum number of citations sent to CrossRef in a single request.
CROSSREF_MAX_BATCH_SIZE = 10
def get_plaintext_citations(file):
    """
    Build the list of cleaned plaintext citations held by a plaintext
    source. The source is expected to contain one citation per line.

    :param file: Either the path to the plaintext file or the content of a
        plaintext file.
    :returns: A list of cleaned plaintext citations, one item per line of
        the source.
    """
    if not os.path.isfile(file):
        # Not an existing file on disk: treat the argument as raw content.
        raw_lines = file.splitlines()
    else:
        with open(file, 'r') as handle:
            raw_lines = handle.readlines()
    # Normalize the whitespace of every line to get plaintext citations.
    return [tools.clean_whitespaces(raw_line) for raw_line in raw_lines]
def get_cited_DOIs(file, timeout=60):
    """
    Get the DOIs of the papers cited in a plaintext file. The file should
    have one citation per line.

    .. note::

        This function is also used as a backend tool by most of the others
        citations processors, to factorize the code.

    Each citation is first searched for a DOI, then for an arXiv id. The
    citations for which neither is found are stripped of their URLs and
    sent to the CrossRef links API, in batches of at most
    ``CROSSREF_MAX_BATCH_SIZE`` citations.

    :param file: Either the path to the plaintext file or the content of a
        plaintext file. It can also be a parsed list of plaintext
        citations and, in this case, no preprocessing is done.
    :param timeout: Timeout, in seconds, passed to ``requests.post`` for
        every CrossRef request, so that a stalled connection cannot block
        the caller forever. Pass ``None`` to wait indefinitely.
    :returns: A dict of cleaned plaintext citations and their associated
        DOI (``None`` when no DOI could be found). Citations resolved
        through CrossRef are keyed by the ``text`` field echoed by
        CrossRef, or by the URL-stripped citation when the request for
        their batch failed.
    """
    if isinstance(file, list):
        # We were handed an already parsed list of plaintext citations.
        plaintext_citations = file
    else:
        # It is either a path to a plaintext file or the content of a
        # plaintext file, pre-process it to get a list of citations.
        plaintext_citations = get_plaintext_citations(file)

    dois = {}
    crossref_queue = []

    # Try to get the DOI directly from the citation
    for citation in plaintext_citations:
        # Some citations already contain a DOI so try to match it directly
        matched_dois = doi.extract_from_text(citation)
        if matched_dois:
            dois[citation] = next(iter(matched_dois))
            continue
        # Same thing for arXiv id
        matched_arxiv = arxiv.extract_from_text(citation)
        if matched_arxiv:
            dois[citation] = arxiv.to_DOI(next(iter(matched_arxiv)))
            continue
        # If no match found, stack it for next step.
        # URLs are removed as the plaintext citations can contain URLs and
        # they are bad for the CrossRef API.
        crossref_queue.append(tools.remove_URLs(citation))

    if not crossref_queue:
        # Everything was resolved locally (or there was nothing to
        # resolve): no need to contact CrossRef at all.
        return dois

    # Do batch with remaining papers, to prevent from the timeout of CrossRef
    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
        batch = list(batch)
        try:
            # Fetch results from CrossRef
            response = requests.post(CROSSREF_LINKS_API_URL,
                                     json=batch,
                                     timeout=timeout)
            results = response.json()["results"]
        except (RequestException, ValueError, KeyError, TypeError):
            # The request failed or the payload is not the expected JSON
            # object: set all the DOIs to None for the current batch.
            for queued in batch:
                dois[queued] = None
            continue

        complete = isinstance(results, list)
        for result in (results if complete else ()):
            if isinstance(result, dict) and "text" in result:
                # Use the DOI when CrossRef found one, None otherwise
                dois[result["text"]] = result.get("doi")
            else:
                complete = False

        if not complete:
            # Some entries were unusable. Make sure every citation of the
            # batch still has an entry, without discarding the DOIs that
            # were successfully resolved just above.
            for queued in batch:
                dois.setdefault(queued, None)
    return dois