From 0d17254f6cd393fce7e79effcbc91a4df10ef045 Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Mon, 28 Dec 2015 00:21:41 +0100
Subject: [PATCH] Add a citation fetcher for plaintext, and factorize code with bbl citations fetcher

---
 libbmc/citations/bbl.py       | 61 ++++-----------------
 libbmc/citations/plaintext.py | 99 +++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 52 deletions(-)
 create mode 100644 libbmc/citations/plaintext.py

diff --git a/libbmc/citations/bbl.py b/libbmc/citations/bbl.py
index 10aef3e..8de79ab 100644
--- a/libbmc/citations/bbl.py
+++ b/libbmc/citations/bbl.py
@@ -1,16 +1,13 @@
 """
-This files contains all the functions to deal with .bbl files.
+This file contains all the functions to extract DOIs of citations from .bbl
+files.
 """
 import os
 import re
-import requests
 import subprocess
 
-from requests.exception import RequestException
-
-from libbmc import doi
 from libbmc import tools
-from libbmc.repositories import arxiv
+from libbmc.citations import plaintext
 
 
 # Regex to match bibitems
@@ -19,11 +16,6 @@ BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
 ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
 
 
-# CrossRef API URL
-CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
-CROSSREF_MAX_BATCH_SIZE = 10
-
-
 def bibitem_as_plaintext(bibitem):
     """
     Return a plaintext representation of the bibitem from the ``.bbl`` file.
@@ -75,51 +67,16 @@ def get_plaintext_citations(bbl):
     return cleaned_bbl
 
 
-def get_cited_DOIs(bbl):
+def get_cited_DOIs(bbl_input):
     """
     Get the DOIs of the papers cited in this .bbl file.
 
-    :param bbl: Either the path to a .bbl file or the content of a .bbl file.
+    :param bbl_input: Either the path to a .bbl file or the content \
+            of a .bbl file.
 
     :returns: A dict of cleaned plaintext citations and their associated DOI.
     """
-    dois = {}
-    crossref_queue = []
     # Get the plaintext citations from the bbl file
-    plaintext_citations = get_plaintext_citations(bbl)
-    # Try to get the DOI directly from the citation
-    for citation in plaintext_citations[:]:
-        # Some citations already contain a DOI so try to match it directly
-        matched_DOIs = doi.extract_from_text(citation)
-        if matched_DOIs is not None:
-            # Add the DOI and go on
-            dois[citation] = matched_DOIs[0]
-            continue
-        # Same thing for arXiv id
-        matched_arXiv = arxiv.extract_from_text(citation)
-        if matched_arXiv is not None:
-            # Add the associated DOI and go on
-            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
-            continue
-        # If no match found, stack it for next step
-        # Note to remove URLs in the citation as the plaintext citations can
-        # contain URLs and they are bad for the CrossRef API.
-        crossref_queue.append(tools.remove_URLs(citation))
-    # Do batch of papers, to prevent from the timeout of crossref
-    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
-        try:
-            # Fetch results from CrossRef
-            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
-            for result in r.json()["results"]:
-                # Try to get a DOI
-                try:
-                    dois[result["text"]] = result["doi"]
-                except KeyError:
-                    # Or set it to None
-                    dois[result["text"]] = None
-        except (RequestException, ValueError, KeyError):
-            # If an exception occurred, set all the DOIs to None for the
-            # current batch
-            for i in batch:
-                dois[i] = None
-    return dois
+    plaintext_citations = get_plaintext_citations(bbl_input)
+    # Use the plaintext citations parser on these citations
+    return plaintext.get_cited_DOIs(plaintext_citations)
diff --git a/libbmc/citations/plaintext.py b/libbmc/citations/plaintext.py
new file mode 100644
index 0000000..58ea18c
--- /dev/null
+++ b/libbmc/citations/plaintext.py
@@ -0,0 +1,99 @@
+"""
+This file contains all the functions to extract DOIs of citations from
+plaintext files.
+"""
+import os
+import requests
+
+from requests.exceptions import RequestException
+
+from libbmc import doi
+from libbmc import tools
+from libbmc.repositories import arxiv
+
+
+# CrossRef API URL
+CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
+CROSSREF_MAX_BATCH_SIZE = 10
+
+
+def get_plaintext_citations(file):
+    """
+    Parse a plaintext file to get a clean list of plaintext citations. The \
+    file should have one citation per line.
+
+    :param file: Either the path to the plaintext file or the content of a \
+            plaintext file.
+    :returns: A list of cleaned plaintext citations.
+    """
+    # Handle path or content
+    if os.path.isfile(file):
+        with open(file, 'r') as fh:
+            content = fh.readlines()
+    else:
+        content = file.splitlines()
+    # Clean every line to have plaintext
+    cleaned_citations = [tools.clean_whitespaces(line) for line in content]
+    return cleaned_citations
+
+
+def get_cited_DOIs(file):
+    """
+    Get the DOIs of the papers cited in a plaintext file. The file should \
+    have one citation per line.
+
+    .. note::
+
+        This function is also used as a backend tool by most of the other \
+        citation processors, to factorize the code.
+
+    :param file: Either the path to the plaintext file or the content of a \
+            plaintext file. It can also be a parsed list of plaintext \
+            citations and, in this case, no preprocessing is done.
+    :returns: A dict of cleaned plaintext citations and their associated DOI.
+    """
+    # A pre-processed list of plaintext citations is used as is
+    plaintext_citations = file
+    if not isinstance(file, list):
+        # Otherwise it is a path or raw content, pre-process it into a list
+        plaintext_citations = get_plaintext_citations(file)
+    dois = {}
+    crossref_queue = []
+
+    # Try to get the DOI directly from the citation
+    for citation in plaintext_citations[:]:
+        # Some citations already contain a DOI so try to match it directly
+        matched_DOIs = doi.extract_from_text(citation)
+        if matched_DOIs is not None:
+            # Add the DOI and go on
+            dois[citation] = matched_DOIs[0]
+            continue
+        # Same thing for arXiv id
+        matched_arXiv = arxiv.extract_from_text(citation)
+        if matched_arXiv is not None:
+            # Add the associated DOI and go on
+            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
+            continue
+        # If no match found, stack it for next step
+        # Note to remove URLs in the citation as the plaintext citations can
+        # contain URLs and they are bad for the CrossRef API.
+        crossref_queue.append(tools.remove_URLs(citation))
+
+    # Do batch with remaining papers, to prevent from the timeout of CrossRef
+    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
+        try:
+            # Fetch results from CrossRef
+            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
+            for result in r.json()["results"]:
+                # Try to get a DOI
+                try:
+                    dois[result["text"]] = result["doi"]
+                except KeyError:
+                    # Or set it to None
+                    dois[result["text"]] = None
+        except (RequestException, ValueError, KeyError):
+            # If an exception occurred, set all the DOIs to None for the
+            # current batch
+            for i in batch:
+                dois[i] = None
+    return dois