Plug arXiv

Lucas Verney 2015-12-23 22:18:52 +01:00
parent a299cc7816
commit 985304446c
2 changed files with 161 additions and 16 deletions


@@ -13,4 +13,5 @@ For building `opendetex` (which is a necessary dependency), you will need
## Usage
* `./main.py some_file.bbl` to get a list of DOIs associated with each `\bibitem`.
* `./main.py arxiv_eprint_id` to get a list of DOIs associated with each reference from the provided arXiv eprint (see the example output below).
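In both cases the script pretty-prints a dict mapping each cleaned plaintext citation to the identifier it found: a `dx.doi.org` link, an arXiv link, or `None` when nothing matched. A sketch of what that output looks like, with made-up citations and identifiers:

```python
# Hypothetical output of `./main.py some_file.bbl`; all values are placeholders.
{'A. Author and B. Author, Some paper title, J. Foo 12, 345 (2010)':
     'http://dx.doi.org/10.1000/example.123',
 'C. Author, Another title (2015)': 'http://arxiv.org/abs/1501.00001',
 'D. Author, An unmatched reference (1999)': None}
```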

main.py

@@ -1,10 +1,24 @@
#!/usr/bin/env python3
import io
import math
import os
import re
import requests
import subprocess
import sys
import tarfile
regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
regex_bibitems = re.compile(r"\\bibitem\{.+?\}")
regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
def clean_whitespaces(text):
    """
    Collapse multiple whitespaces and strip trailing . and , from text.
    """
    return ' '.join(text.strip().rstrip(".,").split())
def oa_from_doi(doi):
@@ -18,34 +32,103 @@ def oa_from_doi(doi):
def clean_bibitem(bibitem):
    """
    Return a plaintext representation of the bibitem from the bbl file.

    Params:
        - bibitem is the text content of the bibitem.

    Returns a cleaned plaintext citation from the bibitem.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
                                      "-s"],
                                     input=bibitem.encode("utf-8"))
    output = output.decode("utf-8")
    output = clean_whitespaces(output)
    return output
def parse_bbl(bbl):
    """
    Parse a *.bbl file to get a clean list of plaintext citations.

    Params:
        - bbl is either the path to the .bbl file or the content of a bbl file.

    Returns a list of cleaned plaintext citations.
    """
    # Handle path or content
    if os.path.isfile(bbl):
        with open(bbl, 'r') as fh:
            bbl_content = fh.read()
    else:
        bbl_content = bbl
    # Get a list of bibitems
    bibitems = regex_bibitems.split(bbl_content)[1:]
    bibitems = [regex_endthebibliography.sub("",
                                             i).strip() for i in bibitems]
    cleaned_bbl = []
    # Clean every bibitem
    for bibitem in bibitems:
        cleaned_bbl.append(clean_bibitem(bibitem))
    return cleaned_bbl
def extract_doi_links(urls):
    """
    Try to find a DOI from a given list of URLs.
    """
    doi_urls = [url for url in urls if "/doi/" in url]
    if len(doi_urls) > 0:
        return ("http://dx.doi.org" +
                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
    else:
        return None


def extract_arxiv_links(urls):
    """
    Try to find an arXiv link from a given list of URLs.
    """
    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
    if len(arxiv_urls) > 0:
        return arxiv_urls[0]
    else:
        return None
def dois_from_bbl(bbl):
    """
    Get the DOIs of the papers cited in the given bbl.

    Params:
        - bbl is either the path to the .bbl file or the content of a bbl file.

    Returns a dict of cleaned plaintext citations and their associated doi.
    """
    cleaned_citations_with_URLs = parse_bbl(bbl)
    dois = {}
    cleaned_citations = []
    # Try to get the DOI directly from the citation
    for citation in cleaned_citations_with_URLs[:]:
        # Get all the urls in the citation
        raw_urls = regex_urls.findall(citation)
        urls = [u.lower() for u in raw_urls]
        # Remove URLs from the citation
        for url in raw_urls:
            citation = citation.replace(url, "")
        citation = clean_whitespaces(citation)
        # Try to find an arXiv link
        arxiv_url = extract_arxiv_links(urls)
        if arxiv_url:
            dois[citation] = arxiv_url
        # Try to find a DOI link
        doi_url = extract_doi_links(urls)
        if doi_url:
            dois[citation] = doi_url
        # If no match was found, stack the citation for the next step
        if citation not in dois:
            cleaned_citations.append(citation)
    # Query crossref in batches of 10 papers to avoid timeouts
    for i in range(math.ceil(len(cleaned_citations) / 10)):
        lower_bound = 10 * i
        upper_bound = min(10 * (i + 1), len(cleaned_citations))
@@ -53,15 +136,76 @@ def dois_from_bbl(bbl_file):
                         json=cleaned_citations[lower_bound:upper_bound])
        for result in r.json()["results"]:
            if "doi" not in result:
                # If DOI is not found, try a direct query to get a DOI
                # r = requests.get("http://search.crossref.org/dois",
                #                  params={
                #                      'q': result["text"],
                #                      "sort": "score",
                #                      "rows": 1
                #                  })
                # doi_result = r.json()
                # if len(doi_result) > 0:
                #     dois[result["text"]] = doi_result[0]["doi"]
                # else:
                #     dois[result["text"]] = None
                dois[result["text"]] = None
            else:
                dois[result["text"]] = result["doi"]
    return dois
if __name__ == "__main__": def sources_from_arxiv(eprint):
if len(sys.argv) < 2: """
sys.exit("Usage: " + sys.argv[0] + " BBL_FILE.") Download sources on arXiv for a given preprint.
Params:
- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
Returns a TarFile object of the sources of the arXiv preprint.
"""
r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
file_object = io.BytesIO(r.content)
return tarfile.open(fileobj=file_object)
def bbl_from_arxiv(eprint):
    """
    Get the .bbl files (if any) of a given preprint.

    Params:
        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).

    Returns a list of the .bbl files as text (an empty list if there are none).
    """
    tf = sources_from_arxiv(eprint)
    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
                 for member in bbl_files]
    return bbl_files
def dois_from_arxiv(eprint):
    """
    Get the DOIs of the papers cited by a given arXiv preprint.

    Params:
        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).

    Returns a dict of cleaned plaintext citations and their associated doi.
    """
    bbl_files = bbl_from_arxiv(eprint)
    dois = {}
    for bbl in bbl_files:
        dois.update(dois_from_bbl(bbl))
    return dois
if __name__ == "__main__":
    import pprint
    if len(sys.argv) < 2:
        sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.")
    if os.path.isfile(sys.argv[1]):
        pprint.pprint(dois_from_bbl(sys.argv[1]))
    else:
        pprint.pprint(dois_from_arxiv(sys.argv[1]))
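For completeness, a minimal sketch of how the new helpers compose when used as a library rather than from the command line. It assumes the file is importable as `main`, that the `1401.2910` eprint mentioned in the docstrings is reachable, and that network access to arXiv and CrossRef is available:

```python
import main

# Download the arXiv sources, extract any .bbl files, and resolve each
# citation to a DOI link, an arXiv link, or None.
references = main.dois_from_arxiv("1401.2910")
for citation, identifier in references.items():
    print(citation, "->", identifier)

# The same resolution also works on a local .bbl file (or its raw content).
references = main.dois_from_bbl("some_file.bbl")
```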