""" This files contains all the functions to extract DOIs of citations from plaintext files. # TODO: Unittests """ import os import requests from requests.exceptions import RequestException from libbmc import doi from libbmc import tools from libbmc.repositories import arxiv # CrossRef API URL CROSSREF_LINKS_API_URL = "http://search.crossref.org/links" CROSSREF_MAX_BATCH_SIZE = 10 def get_plaintext_citations(file): """ Parse a plaintext file to get a clean list of plaintext citations. The \ file should have one citation per line. :param file: Either the path to the plaintext file or the content of a \ plaintext file. :returns: A list of cleaned plaintext citations. """ # Handle path or content if os.path.isfile(file): with open(file, 'r') as fh: content = fh.readlines() else: content = file.splitlines() # Clean every line to have plaintext cleaned_citations = [tools.clean_whitespaces(line) for line in content] return cleaned_citations def get_cited_DOIs(file): """ Get the DOIs of the papers cited in a plaintext file. The file should \ have one citation per line. .. note:: This function is also used as a backend tool by most of the others \ citations processors, to factorize the code. :param file: Either the path to the plaintext file or the content of a \ plaintext file. It can also be a parsed list of plaintext \ citations and, in this case, no preprocessing is done. :returns: A dict of cleaned plaintext citations and their associated DOI. """ # If file is not a pre-processed list of plaintext citations if not isinstance(file, list): # It is either a path to a plaintext file or the content of a plaintext # file, we need some pre-processing to get a list of citations. plaintext_citations = get_plaintext_citations(file) else: # Else, we passed a list of plaintext citations. plaintext_citations = file dois = {} crossref_queue = [] # Try to get the DOI directly from the citation for citation in plaintext_citations[:]: # Some citations already contain a DOI so try to match it directly matched_DOIs = doi.extract_from_text(citation) if len(matched_DOIs) > 0: # Add the DOI and go on dois[citation] = next(iter(matched_DOIs)) continue # Same thing for arXiv id matched_arXiv = arxiv.extract_from_text(citation) if len(matched_arXiv) > 0: # Add the associated DOI and go on dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv))) continue # If no match found, stack it for next step # Note to remove URLs in the citation as the plaintext citations can # contain URLs and they are bad for the CrossRef API. crossref_queue.append(tools.remove_URLs(citation)) # Do batch with remaining papers, to prevent from the timeout of CrossRef for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE): batch = [i for i in batch] try: # Fetch results from CrossRef r = requests.post(CROSSREF_LINKS_API_URL, json=batch) for result in r.json()["results"]: # Try to get a DOI try: dois[result["text"]] = result["doi"] except KeyError: # Or set it to None dois[result["text"]] = None except (RequestException, ValueError, KeyError): # If an exception occurred, set all the DOIs to None for the # current batch for i in batch: dois[i] = None return dois