Plug arXiv

parent a299cc7816
commit 985304446c
@@ -13,4 +13,5 @@ For building `opendetex` (which is a necessary dependency), you will need
 
 ## Usage
 
-`./main.py some_file.bbl` to get a list of DOIs associated to each `\bibitem`.
+* `./main.py some_file.bbl` to get a list of DOIs associated to each `\bibitem`.
+* `./main.py arxiv_eprint_id` to get a list of DOIs associated to each reference from the provided arXiv eprint.
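For context, a minimal sketch of the equivalent programmatic use, assuming `main.py` is importable from the repository root; the file path and eprint id are placeholders (see the `main.py` diff below for the functions used):

```python
# Hypothetical programmatic equivalent of the two CLI invocations above.
from main import dois_from_bbl, dois_from_arxiv

# From a local .bbl file: a dict mapping each cleaned plaintext citation
# to a DOI or arXiv link, when one could be resolved.
print(dois_from_bbl("some_file.bbl"))

# From an arXiv eprint id: downloads the sources, extracts any .bbl
# files, and resolves their references the same way.
print(dois_from_arxiv("1401.2910"))
```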
main.py (168 changed lines)
@@ -1,10 +1,24 @@
 #!/usr/bin/env python3
+import io
 import math
 import os
 import re
 import requests
 import subprocess
 import sys
+import tarfile
+
+
+regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+regex_bibitems = re.compile(r"\\bibitem\{.+?\}")
+regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
 
 
+def clean_whitespaces(text):
+    """
+    Remove double whitespaces and trailing . and , from text.
+    """
+    return ' '.join(text.strip().rstrip(".,").split())
+
+
 def oa_from_doi(doi):
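The module-level regexes and `clean_whitespaces` added here are the building blocks the rest of the diff reuses; a quick illustrative check (the sample strings are made up):

```python
import re

# Same patterns as the module-level regexes added above.
regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
regex_bibitems = re.compile(r"\\bibitem\{.+?\}")


def clean_whitespaces(text):
    # Collapse whitespace runs and strip a trailing "." or ","
    return ' '.join(text.strip().rstrip(".,").split())


sample = r"\bibitem{doe14} J. Doe,  Some   title,  2014."
print(clean_whitespaces(regex_bibitems.split(sample)[1]))
# -> J. Doe, Some title, 2014

print(regex_urls.findall("see https://arxiv.org/abs/1401.2910 for details"))
# -> ['https://arxiv.org/abs/1401.2910']
```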
@@ -18,34 +32,103 @@ def oa_from_doi(doi):
 def clean_bibitem(bibitem):
     """
     Return a plaintext representation of the bibitem from the bbl file.
+
+    Params:
+        - bibitem is the text content of the bibitem.
+
+    Returns a cleaned plaintext citation from the bibitem.
     """
     script_dir = os.path.dirname(os.path.abspath(__file__))
-    output = subprocess.check_output([script_dir + "/opendetex/delatex", "-s"],
+    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
+                                      "-s"],
                                      input=bibitem.encode("utf-8"))
     output = output.decode("utf-8")
-    output = ' '.join(output.strip().rstrip(".,").split())
+    output = clean_whitespaces(output)
     return output
 
 
-def parse_bbl(bbl_file):
-    with open(bbl_file, 'r') as fh:
-        bbl_content = fh.read()
-    bibitems = re.split(r"\\bibitem\{.+?\}", bbl_content)[1:]
-    bibitems = [re.sub(r"\\end\{thebibliography}",
-                       "",
-                       i).strip() for i in bibitems]
+def parse_bbl(bbl):
+    """
+    Parse a *.bbl file to get a clean list of plaintext citations.
+
+    Params:
+        - bbl is either the path to the .bbl file or the content of a bbl file.
+
+    Returns a list of cleaned plaintext citations.
+    """
+    # Handle path or content
+    if os.path.isfile(bbl):
+        with open(bbl, 'r') as fh:
+            bbl_content = fh.read()
+    else:
+        bbl_content = bbl
+    # Get a list of bibitems
+    bibitems = regex_bibitems.split(bbl_content)[1:]
+    bibitems = [regex_endthebibliography.sub("",
+                                             i).strip() for i in bibitems]
     cleaned_bbl = []
+    # Clean every bibitem
     for bibitem in bibitems:
         cleaned_bbl.append(clean_bibitem(bibitem))
     return cleaned_bbl
 
 
-def dois_from_bbl(bbl_file):
+def extract_doi_links(urls):
+    """
+    Try to find a DOI from a given list of URLs.
+    """
+    doi_urls = [url for url in urls if "/doi/" in url]
+    if len(doi_urls) > 0:
+        return ("http://dx.doi.org" +
+                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
+    else:
+        return None
+
+
+def extract_arxiv_links(urls):
+    """
+    Try to find an arXiv link from a given list of URLs.
+    """
+    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
+    if len(arxiv_urls) > 0:
+        return arxiv_urls[0]
+    else:
+        return None
+
+
+def dois_from_bbl(bbl):
     """
-    Get the papers cited by the paper identified by the given DOI.
+    Get the papers cited by the paper whose bbl is given.
+
+    Params:
+        - bbl is either the path to the .bbl file or the content of a bbl file.
+
+    Returns a dict of cleaned plaintext citations and their associated doi.
     """
-    cleaned_citations = parse_bbl(bbl_file)
+    cleaned_citations_with_URLs = parse_bbl(bbl)
     dois = {}
+    cleaned_citations = []
+    # Try to get the DOI directly from the citation
+    for citation in cleaned_citations_with_URLs[:]:
+        # Get all the urls in the citation
+        raw_urls = regex_urls.findall(citation)
+        urls = [u.lower() for u in raw_urls]
+        # Remove URLs in citation
+        for url in raw_urls:
+            citation = citation.replace(url, "")
+        citation = clean_whitespaces(citation)
+        # Try to find an arXiv link
+        arxiv_url = extract_arxiv_links(urls)
+        if arxiv_url:
+            dois[citation] = arxiv_url
+        # Try to find a DOI link
+        doi_url = extract_doi_links(urls)
+        if doi_url:
+            dois[citation] = doi_url
+        # If no match found, stack it for next step
+        if citation not in dois:
+            cleaned_citations.append(citation)
+    # Do batches of 10 papers, to avoid crossref timeouts
     for i in range(math.ceil(len(cleaned_citations) / 10)):
         lower_bound = 10 * i
         upper_bound = min(10 * (i + 1), len(cleaned_citations))
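The `/doi/` rewrite in `extract_doi_links` is easy to sanity-check; a small illustrative run (the URLs are made up, and already lowercased as `dois_from_bbl` does):

```python
def extract_doi_links(urls):
    # Keep the first URL containing "/doi/" and graft its DOI part
    # onto the dx.doi.org resolver.
    doi_urls = [url for url in urls if "/doi/" in url]
    if len(doi_urls) > 0:
        return ("http://dx.doi.org" +
                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
    return None


urls = ["http://link.aps.org/doi/10.1103/physrevlett.112.068103",
        "https://arxiv.org/abs/1401.2910"]
print(extract_doi_links(urls))
# -> http://dx.doi.org/10.1103/physrevlett.112.068103
```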
@@ -53,15 +136,76 @@ def dois_from_bbl(bbl_file):
                           json=cleaned_citations[lower_bound:upper_bound])
         for result in r.json()["results"]:
             if "doi" not in result:
+                # If DOI is not found, try a direct query to get a DOI
+                # r = requests.get("http://search.crossref.org/dois",
+                #                  params={
+                #                      'q': result["text"],
+                #                      "sort": "score",
+                #                      "rows": 1
+                #                  })
+                # doi_result = r.json()
+                # if len(doi_result) > 0:
+                #     dois[result["text"]] = doi_result[0]["doi"]
+                # else:
+                #     dois[result["text"]] = None
                 dois[result["text"]] = None
             else:
                 dois[result["text"]] = result["doi"]
     return dois
 
 
+def sources_from_arxiv(eprint):
+    """
+    Download sources on arXiv for a given preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a TarFile object of the sources of the arXiv preprint.
+    """
+    r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
+    file_object = io.BytesIO(r.content)
+    return tarfile.open(fileobj=file_object)
+
+
+def bbl_from_arxiv(eprint):
+    """
+    Get the .bbl files (if any) of a given preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a (possibly empty) list of the .bbl files as text.
+    """
+    tf = sources_from_arxiv(eprint)
+    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
+    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
+                 for member in bbl_files]
+    return bbl_files
+
+
+def dois_from_arxiv(eprint):
+    """
+    Get the DOIs of the papers cited by a given preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a dict of cleaned plaintext citations and their associated doi.
+    """
+    bbl_files = bbl_from_arxiv(eprint)
+    dois = {}
+    for bbl in bbl_files:
+        dois.update(dois_from_bbl(bbl))
+    return dois
+
+
 if __name__ == "__main__":
     import pprint
     if len(sys.argv) < 2:
-        sys.exit("Usage: " + sys.argv[0] + " BBL_FILE.")
+        sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.")
 
-    pprint.pprint(dois_from_bbl(sys.argv[1]))
+    if os.path.isfile(sys.argv[1]):
+        pprint.pprint(dois_from_bbl(sys.argv[1]))
+    else:
+        pprint.pprint(dois_from_arxiv(sys.argv[1]))
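Taken together, the new arXiv path chains `sources_from_arxiv` → `bbl_from_arxiv` → `dois_from_bbl` over each extracted file. A minimal standalone sketch of the first two steps, assuming network access and that the e-print is a tar archive (as the code above does); the eprint id is the example from the docstrings:

```python
import io
import tarfile

import requests

eprint = "1401.2910"  # example id from the docstrings above
r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
tf = tarfile.open(fileobj=io.BytesIO(r.content))

# List the .bbl files shipped with the preprint sources, as
# bbl_from_arxiv does before decoding them to text.
print([m.name for m in tf.getmembers() if m.name.endswith(".bbl")])
```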