From 985304446cf7da26b2dee8a43029f2964554df2e Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Wed, 23 Dec 2015 22:18:52 +0100
Subject: [PATCH] Plug arXiv

---
 README.md |   3 +-
 main.py   | 174 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 161 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 9fc9865..e44cdff 100644
--- a/README.md
+++ b/README.md
@@ -13,4 +13,5 @@ For building `opendetex` (which is a necessary dependency), you will need
 
 ## Usage
 
-`./main.py some_file.bbl` to get a list of DOIs associated to each `\bibitem`.
+* `./main.py some_file.bbl` to get a list of DOIs associated with each `\bibitem`.
+* `./main.py arxiv_eprint_id` to get a list of DOIs associated with each reference in the provided arXiv eprint.
diff --git a/main.py b/main.py
index 94914f4..1e2613a 100755
--- a/main.py
+++ b/main.py
@@ -1,10 +1,24 @@
 #!/usr/bin/env python3
+import io
 import math
 import os
 import re
 import requests
 import subprocess
 import sys
+import tarfile
+
+
+regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+regex_bibitems = re.compile(r"\\bibitem\{.+?\}")
+regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
+
+
+def clean_whitespaces(text):
+    """
+    Collapse consecutive whitespace and strip trailing . and , from text.
+    """
+    return ' '.join(text.strip().rstrip(".,").split())
 
 
 def oa_from_doi(doi):
     """
@@ -18,34 +32,103 @@
 def clean_bibitem(bibitem):
     """
     Return a plaintext representation of the bibitem from the bbl file.
+
+    Params:
+        - bibitem is the text content of the bibitem.
+
+    Returns a cleaned plaintext citation from the bibitem.
     """
     script_dir = os.path.dirname(os.path.abspath(__file__))
-    output = subprocess.check_output([script_dir + "/opendetex/delatex", "-s"],
+    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
+                                      "-s"],
                                      input=bibitem.encode("utf-8"))
     output = output.decode("utf-8")
-    output = ' '.join(output.strip().rstrip(".,").split())
+    output = clean_whitespaces(output)
     return output
 
 
-def parse_bbl(bbl_file):
-    with open(bbl_file, 'r') as fh:
-        bbl_content = fh.read()
-    bibitems = re.split(r"\\bibitem\{.+?\}", bbl_content)[1:]
-    bibitems = [re.sub(r"\\end\{thebibliography}",
-                       "",
-                       i).strip() for i in bibitems]
+def parse_bbl(bbl):
+    """
+    Parse a *.bbl file to get a clean list of plaintext citations.
+
+    Params:
+        - bbl is either the path to the .bbl file or the content of a .bbl file.
+
+    Returns a list of cleaned plaintext citations.
+    """
+    # Handle either a path or the content itself
+    if os.path.isfile(bbl):
+        with open(bbl, 'r') as fh:
+            bbl_content = fh.read()
+    else:
+        bbl_content = bbl
+    # Get a list of bibitems
+    bibitems = regex_bibitems.split(bbl_content)[1:]
+    bibitems = [regex_endthebibliography.sub("",
+                                             i).strip() for i in bibitems]
     cleaned_bbl = []
+    # Clean every bibitem
     for bibitem in bibitems:
         cleaned_bbl.append(clean_bibitem(bibitem))
     return cleaned_bbl
 
 
-def dois_from_bbl(bbl_file):
+def extract_doi_links(urls):
+    """
+    Try to find a DOI from a given list of URLs.
+    """
+    doi_urls = [url for url in urls if "/doi/" in url]
+    if len(doi_urls) > 0:
+        # Slicing at find("/doi/") + 4 keeps the trailing "/<DOI>" part
+        return ("http://dx.doi.org" +
+                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
+    else:
+        return None
+
+
+def extract_arxiv_links(urls):
+    """
+    Try to find an arXiv link from a given list of URLs.
+    """
+    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
+    if len(arxiv_urls) > 0:
+        return arxiv_urls[0]
+    else:
+        return None
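For illustration, here is a minimal sketch of how the two helpers above classify the URLs found in a citation. The citation text and URLs are invented for this example; the regex is the `regex_urls` defined at the top of main.py:

    import re

    regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

    # Invented citation: one arXiv link and one publisher link containing /doi/
    citation = ("J. Doe, Some title, Some Journal (2014), "
                "http://arxiv.org/abs/1401.2910 "
                "http://journals.example.org/doi/10.1103/PhysRevX.4.000000")
    urls = [u.lower() for u in regex_urls.findall(citation)]
    print(urls)  # both URLs, lowercased
    # extract_arxiv_links(urls) -> "http://arxiv.org/abs/1401.2910"
    # extract_doi_links(urls)   -> "http://dx.doi.org/10.1103/physrevx.4.000000"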
+ """ + arxiv_urls = [url for url in urls if "://arxiv.org" in url] + if len(arxiv_urls) > 0: + return arxiv_urls[0] + else: + return None + + +def dois_from_bbl(bbl): """ Get the papers cited by the paper identified by the given DOI. + + Params: + - bbl is either the path to the .bbl file or the content of a bbl file. + + Returns a dict of cleaned plaintext citations and their associated doi. """ - cleaned_citations = parse_bbl(bbl_file) + cleaned_citations_with_URLs = parse_bbl(bbl) dois = {} + cleaned_citations = [] + # Try to get the DOI directly from the citation + for citation in cleaned_citations_with_URLs[:]: + # Get all the urls in the citation + raw_urls = regex_urls.findall(citation) + urls = [u.lower() for u in raw_urls] + # Remove URLs in citation + for url in raw_urls: + citation = citation.replace(url, "") + citation = clean_whitespaces(citation) + # Try to find an arXiv link + arxiv_url = extract_arxiv_links(urls) + if arxiv_url: + dois[citation] = arxiv_url + # Try to find a DOI link + doi_url = extract_doi_links(urls) + if doi_url: + dois[citation] = doi_url + # If no match found, stack it for next step + if citation not in dois: + cleaned_citations.append(citation) + # Do batch of 10 papers, to prevent from the timeout of crossref for i in range(math.ceil(len(cleaned_citations) / 10)): lower_bound = 10 * i upper_bound = min(10 * (i + 1), len(cleaned_citations)) @@ -53,15 +136,76 @@ def dois_from_bbl(bbl_file): json=cleaned_citations[lower_bound:upper_bound]) for result in r.json()["results"]: if "doi" not in result: + # If DOI is not found, try a direct query to get a DOI + # r = requests.get("http://search.crossref.org/dois", + # params={ + # 'q': result["text"], + # "sort": "score", + # "rows": 1 + # }) + # doi_result = r.json() + # if len(doi_result) > 0: + # dois[result["text"]] = doi_result[0]["doi"] + # else: + # dois[result["text"]] = None dois[result["text"]] = None else: dois[result["text"]] = result["doi"] return dois -if __name__ == "__main__": - if len(sys.argv) < 2: - sys.exit("Usage: " + sys.argv[0] + " BBL_FILE.") +def sources_from_arxiv(eprint): + """ + Download sources on arXiv for a given preprint. + Params: + - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). + + Returns a TarFile object of the sources of the arXiv preprint. + """ + r = requests.get("http://arxiv.org/e-print/%s" % (eprint,)) + file_object = io.BytesIO(r.content) + return tarfile.open(fileobj=file_object) + + +def bbl_from_arxiv(eprint): + """ + Get the .bbl files (if any) of a given preprint. + + Params: + - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). + + Returns a list of the .bbl files as text (if any) or None. + """ + tf = sources_from_arxiv(eprint) + bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")] + bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING) + for member in bbl_files] + return bbl_files + + +def dois_from_arxiv(eprint): + """ + Get the .bbl files (if any) of a given preprint. + + Params: + - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). + + Returns a dict of cleaned plaintext citations and their associated doi. 
+ """ + bbl_files = bbl_from_arxiv(eprint) + dois = {} + for bbl in bbl_files: + dois.update(dois_from_bbl(bbl)) + return dois + + +if __name__ == "__main__": import pprint - pprint.pprint(dois_from_bbl(sys.argv[1])) + if len(sys.argv) < 2: + sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.") + + if os.path.isfile(sys.argv[1]): + pprint.pprint(dois_from_bbl(sys.argv[1])) + else: + pprint.pprint(dois_from_arxiv(sys.argv[1]))