Clean a bit the code
This commit is contained in:
parent 97b90ee610
commit 51df30918a
.gitignore vendored (new file, 1 line)
@@ -0,0 +1 @@
__pycache__
arxiv.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import bbl
import io
import requests
import tarfile


def sources_from_arxiv(eprint):
    """
    Download sources on arXiv for a given preprint.

    Params:
        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).

    Returns a TarFile object of the sources of the arXiv preprint.
    """
    r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
    file_object = io.BytesIO(r.content)
    return tarfile.open(fileobj=file_object)


def bbl_from_arxiv(eprint):
    """
    Get the .bbl files (if any) of a given preprint.

    Params:
        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).

    Returns a list of the .bbl files as text (if any) or None.
    """
    tf = sources_from_arxiv(eprint)
    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
                 for member in bbl_files]
    return bbl_files


def get_dois(eprint):
    """
    Get the DOIs of the papers cited by a given preprint.

    Params:
        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).

    Returns a dict of cleaned plaintext citations and their associated doi.
    """
    bbl_files = bbl_from_arxiv(eprint)
    dois = {}
    for bbl_file in bbl_files:
        dois.update(bbl.get_dois(bbl_file))
    return dois
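For context, a minimal usage sketch of the new arxiv module (the e-print id below is just an example; resolving citations also needs the opendetex/delatex binary that bbl.clean_bibitem shells out to, plus network access):

    import arxiv

    # Download the e-print sources, extract every .bbl from the tarball and
    # map each cleaned citation to a DOI or arXiv URL (None when unresolved).
    dois = arxiv.get_dois("1401.2910")
    for citation, url in dois.items():
        print(citation, "->", url)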
bbl.py (new file, 124 lines)
@@ -0,0 +1,124 @@
import doi
import math
import os
import requests
import subprocess

import regex
import tools


def clean_bibitem(bibitem):
    """
    Return a plaintext representation of the bibitem from the bbl file.

    Params:
        - bibitem is the text content of the bibitem.

    Returns a cleaned plaintext citation from the bibitem.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
                                      "-s"],
                                     input=bibitem.encode("utf-8"))
    output = output.decode("utf-8")
    output = tools.clean_whitespaces(output)
    return output


def parse(bbl):
    """
    Parse a *.bbl file to get a clean list of plaintext citations.

    Params:
        - bbl is either the path to the .bbl file or the content of a bbl file.

    Returns a list of cleaned plaintext citations.
    """
    # Handle either a path or the content itself
    if os.path.isfile(bbl):
        with open(bbl, 'r') as fh:
            bbl_content = fh.read()
    else:
        bbl_content = bbl
    # Get a list of bibitems
    bibitems = regex.bibitems.split(bbl_content)[1:]
    bibitems = [regex.endthebibliography.sub("",
                                             i).strip() for i in bibitems]
    cleaned_bbl = []
    # Clean every bibitem
    for bibitem in bibitems:
        cleaned_bbl.append(clean_bibitem(bibitem))
    return cleaned_bbl


def get_dois(bbl_input):
    """
    Get the DOIs of the papers cited in a given .bbl file.

    Params:
        - bbl_input is either the path to the .bbl file or the content of a
          bbl file.

    Returns a dict of cleaned plaintext citations and their associated doi.
    """
    cleaned_citations_with_URLs = parse(bbl_input)
    dois = {}
    cleaned_citations = []
    # Try to get the DOI directly from the citation
    for citation in cleaned_citations_with_URLs[:]:
        # Get all the urls in the citation
        raw_urls = regex.urls.findall(citation)
        urls = [u.lower() for u in raw_urls]
        # Remove URLs from the citation
        for url in raw_urls:
            citation = citation.replace(url, "")
        citation = tools.clean_whitespaces(citation)
        # Try to find an arXiv link
        arxiv_url = doi.extract_arxiv_links(urls)
        if arxiv_url:
            dois[citation] = arxiv_url
        # Try to find a DOI link
        doi_url = doi.extract_doi_links(urls)
        if doi_url:
            dois[citation] = doi_url
        # Try to find a direct match using a regex if the links search failed
        if not doi_url and not arxiv_url:
            regex_match = doi.match_doi_or_arxiv(citation)
            if regex_match:
                print(regex_match)
                citation = citation.replace(regex_match[1], "")
                if regex_match[0] == "DOI":
                    dois[citation] = "http://dx.doi.org/%s" % (regex_match[1],)
                else:
                    dois[citation] = (
                        "http://arxiv.org/abs/%s" %
                        (regex_match[1].replace("arxiv:", ""),)
                    )
        # If no match was found, stack the citation for the next step
        if citation not in dois:
            cleaned_citations.append(citation)
    # Query Crossref in batches of 10 papers to avoid timeouts
    for i in range(math.ceil(len(cleaned_citations) / 10)):
        lower_bound = 10 * i
        upper_bound = min(10 * (i + 1), len(cleaned_citations))
        r = requests.post("http://search.crossref.org/links",
                          json=cleaned_citations[lower_bound:upper_bound])
        for result in r.json()["results"]:
            if "doi" not in result:
                # If no DOI was found, a direct query could be tried instead
                # r = requests.get("http://search.crossref.org/dois",
                #                  params={
                #                      'q': result["text"],
                #                      "sort": "score",
                #                      "rows": 1
                #                  })
                # doi_result = r.json()
                # if len(doi_result) > 0:
                #     dois[result["text"]] = doi_result[0]["doi"]
                # else:
                #     dois[result["text"]] = None
                dois[result["text"]] = None
            else:
                dois[result["text"]] = result["doi"]
    return dois
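A short sketch of how the new bbl module is driven; the .bbl fragment is invented for illustration, and clean_bibitem still requires the bundled opendetex/delatex binary to be built:

    import bbl

    # Hypothetical .bbl content; parse() also accepts a path to a .bbl file.
    sample = r"""
    \begin{thebibliography}{1}
    \bibitem{smith14} J. Smith, An example title, 2014. doi:10.1000/example.123
    \end{thebibliography}
    """

    print(bbl.parse(sample))     # list of cleaned plaintext citations
    print(bbl.get_dois(sample))  # {citation: DOI / arXiv URL, or None}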
doi.py (new file, 77 lines)
@@ -0,0 +1,77 @@
import regex
import tools


def extract_doi_links(urls):
    """
    Try to find a DOI from a given list of URLs.
    """
    doi_urls = [url for url in urls if "/doi/" in url]
    if len(doi_urls) > 0:
        return ("http://dx.doi.org" +
                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
    else:
        return None


def extract_arxiv_links(urls):
    """
    Try to find an arXiv link from a given list of URLs.
    """
    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
    if len(arxiv_urls) > 0:
        return arxiv_urls[0]
    else:
        return None


def match_doi_or_arxiv(text, only=["DOI", "arXiv"]):
    """
    Search for a valid article ID (DOI or arXiv) in the given text
    (regex-based).

    Returns a tuple (type, first matching ID) or None if not found.
    From: http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    text = text.lower()
    # Try to extract a DOI
    if "DOI" in only:
        extractID = regex.doi.search(text.replace('Œ', '-'))
        if not extractID:
            # PNAS fix
            extractID = regex.doi_pnas.search(text.replace('pnas', '/pnas'))
            if not extractID:
                # JSB fix
                extractID = regex.doi_jsb.search(text)
        if extractID:
            # If a DOI was extracted, clean it up and return it
            cleanDOI = False
            cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
            if regex.clean_doi.search(cleanDOI):
                cleanDOI = cleanDOI[1:]
            # FASEB J fix
            if regex.clean_doi_fabse.search(cleanDOI):
                cleanDOI = cleanDOI[:20]
            # Second JCB fix
            if regex.clean_doi_jcb.search(cleanDOI):
                cleanDOI = cleanDOI[:21]
            if len(cleanDOI) > 40:
                cleanDOItemp = regex.clean_doi_len.sub('000', cleanDOI)
                reps = {'.': 'A', '-': '0'}
                cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
                digitStart = 0
                for i in range(len(cleanDOItemp)):
                    if cleanDOItemp[i].isdigit():
                        digitStart = 1
                    if cleanDOItemp[i].isalpha() and digitStart:
                        break
                cleanDOI = cleanDOI[0:(8 + i)]
            return ("DOI", cleanDOI)
    # Else, try to extract an arXiv id
    if "arXiv" in only:
        extractID = regex.arXiv.search(text)
        if extractID:
            return ("arXiv", extractID.group(1))
    return None
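A quick sketch of what the doi helpers return (the URLs and identifiers are made-up examples):

    import doi

    # Link-based extraction: rewrite a publisher /doi/ link to dx.doi.org.
    print(doi.extract_doi_links(
        ["http://onlinelibrary.wiley.com/doi/10.1000/example.123"]))
    # -> 'http://dx.doi.org/10.1000/example.123'

    # Regex-based extraction: returns a (type, id) tuple, or None.
    print(doi.match_doi_or_arxiv("see doi:10.1000/example.123 for details"))
    # -> ('DOI', '10.1000/example.123')
    print(doi.match_doi_or_arxiv("preprint available as arXiv:1401.2910v1"))
    # -> ('arXiv', '1401.2910v1')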
main.py (modified, 276 lines changed)
@@ -1,40 +1,10 @@
 #!/usr/bin/env python3
-import io
-import math
 import os
-import re
-import requests
-import subprocess
 import sys
-import tarfile
 
-
-regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
-regex_bibitems = re.compile(r"\\bibitem\{.+?\}")
-regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
-
-regex_doi = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
-regex_doi_pnas = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
-regex_doi_jsb = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)
-regex_clean_doi = re.compile('^/')
-regex_clean_doi_fabse = re.compile('^10.1096')
-regex_clean_doi_jcb = re.compile('^10.1083')
-regex_clean_doi_len = re.compile(r'\d\.\d')
-regex_arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
-
-
-def replaceAll(text, dic):
-    """Replace all the dic keys by the associated item in text"""
-    for i, j in dic.items():
-        text = text.replace(i, j)
-    return text
-
-
-def clean_whitespaces(text):
-    """
-    Remove double whitespaces and trailing . and , from text.
-    """
-    return ' '.join(text.strip().rstrip(".,").split())
+# Local import
+import arxiv
+import bbl
 
 
 def oa_from_doi(doi):
@@ -45,248 +15,12 @@ def oa_from_doi(doi):
     pass
 
 
-def clean_bibitem(bibitem):
-    """
-    Return a plaintext representation of the bibitem from the bbl file.
-
-    Params:
-        - bibitem is the text content of the bibitem.
-
-    Returns a cleaned plaintext citation from the bibitem.
-    """
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
-                                      "-s"],
-                                     input=bibitem.encode("utf-8"))
-    output = output.decode("utf-8")
-    output = clean_whitespaces(output)
-    return output
-
-
-def parse_bbl(bbl):
-    """
-    Parse a *.bbl file to get a clean list of plaintext citations.
-
-    Params:
-        - bbl is either the path to the .bbl file or the content of a bbl file.
-
-    Returns a list of cleaned plaintext citations.
-    """
-    # Handle path or content
-    if os.path.isfile(bbl):
-        with open(bbl, 'r') as fh:
-            bbl_content = fh.read()
-    else:
-        bbl_content = bbl
-    # Get a list of bibitems
-    bibitems = regex_bibitems.split(bbl_content)[1:]
-    bibitems = [regex_endthebibliography.sub("",
-                                             i).strip() for i in bibitems]
-    cleaned_bbl = []
-    # Clean every bibitem
-    for bibitem in bibitems:
-        cleaned_bbl.append(clean_bibitem(bibitem))
-    return cleaned_bbl
-
-
-def extract_doi_links(urls):
-    """
-    Try to find a DOI from a given list of URLs.
-    """
-    doi_urls = [url for url in urls if "/doi/" in url]
-    if len(doi_urls) > 0:
-        return ("http://dx.doi.org" +
-                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
-    else:
-        return None
-
-
-def extract_arxiv_links(urls):
-    """
-    Try to find an arXiv link from a given list of URLs.
-    """
-    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
-    if len(arxiv_urls) > 0:
-        return arxiv_urls[0]
-    else:
-        return None
-
-
-def match_doi_or_arxiv(text, only=["DOI", "arXiv"]):
-    """
-    Search for a valid article ID (DOI or ArXiv) in the given text
-    (regex-based).
-
-    Returns a tuple (type, first matching ID) or None if not found.
-    From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
-    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
-    """
-    text = text.lower()
-    # Try to extract DOI
-    if "DOI" in only:
-        extractID = regex_doi.search(text.replace('Œ', '-'))
-        if not extractID:
-            # PNAS fix
-            extractID = regex_doi_pnas.search(text.
-                                              replace('pnas', '/pnas'))
-            if not extractID:
-                # JSB fix
-                extractID = regex_doi_jsb.search(text)
-        if extractID:
-            # If DOI extracted, clean it and return it
-            cleanDOI = False
-            cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
-            if regex_clean_doi.search(cleanDOI):
-                cleanDOI = cleanDOI[1:]
-            # FABSE J fix
-            if regex_clean_doi_fabse.search(cleanDOI):
-                cleanDOI = cleanDOI[:20]
-            # Second JCB fix
-            if regex_clean_doi_jcb.search(cleanDOI):
-                cleanDOI = cleanDOI[:21]
-            if len(cleanDOI) > 40:
-                cleanDOItemp = regex_clean_doi_len.sub('000', cleanDOI)
-                reps = {'.': 'A', '-': '0'}
-                cleanDOItemp = replaceAll(cleanDOItemp[8:], reps)
-                digitStart = 0
-                for i in range(len(cleanDOItemp)):
-                    if cleanDOItemp[i].isdigit():
-                        digitStart = 1
-                    if cleanDOItemp[i].isalpha() and digitStart:
-                        break
-                cleanDOI = cleanDOI[0:(8+i)]
-            return ("DOI", cleanDOI)
-    # Else, try to extract arXiv
-    if "arXiv" in only:
-        extractID = regex_arXiv.search(text)
-        if extractID:
-            return ("arXiv", extractID.group(1))
-    return None
-
-
-def dois_from_bbl(bbl):
-    """
-    Get the papers cited by the paper identified by the given DOI.
-
-    Params:
-        - bbl is either the path to the .bbl file or the content of a bbl file.
-
-    Returns a dict of cleaned plaintext citations and their associated doi.
-    """
-    cleaned_citations_with_URLs = parse_bbl(bbl)
-    dois = {}
-    cleaned_citations = []
-    # Try to get the DOI directly from the citation
-    for citation in cleaned_citations_with_URLs[:]:
-        # Get all the urls in the citation
-        raw_urls = regex_urls.findall(citation)
-        urls = [u.lower() for u in raw_urls]
-        # Remove URLs in citation
-        for url in raw_urls:
-            citation = citation.replace(url, "")
-        citation = clean_whitespaces(citation)
-        # Try to find an arXiv link
-        arxiv_url = extract_arxiv_links(urls)
-        if arxiv_url:
-            dois[citation] = arxiv_url
-        # Try to find a DOI link
-        doi_url = extract_doi_links(urls)
-        if doi_url:
-            dois[citation] = doi_url
-        # Try to find a direct match using a regex if links search failed
-        if not doi_url and not arxiv_url:
-            regex_match = match_doi_or_arxiv(citation)
-            if regex_match:
-                print(regex_match)
-                citation = citation.replace(regex_match[1], "")
-                if regex_match[0] == "DOI":
-                    dois[citation] = "http://dx.doi.org/%s" % (regex_match[1],)
-                else:
-                    dois[citation] = (
-                        "http://arxiv.org/abs/%s" %
-                        (regex_match[1].replace("arxiv:", ""),)
-                    )
-        # If no match found, stack it for next step
-        if citation not in dois:
-            cleaned_citations.append(citation)
-    # Do batch of 10 papers, to prevent from the timeout of crossref
-    for i in range(math.ceil(len(cleaned_citations) / 10)):
-        lower_bound = 10 * i
-        upper_bound = min(10 * (i + 1), len(cleaned_citations))
-        r = requests.post("http://search.crossref.org/links",
-                          json=cleaned_citations[lower_bound:upper_bound])
-        for result in r.json()["results"]:
-            if "doi" not in result:
-                # If DOI is not found, try a direct query to get a DOI
-                # r = requests.get("http://search.crossref.org/dois",
-                #                  params={
-                #                      'q': result["text"],
-                #                      "sort": "score",
-                #                      "rows": 1
-                #                  })
-                # doi_result = r.json()
-                # if len(doi_result) > 0:
-                #     dois[result["text"]] = doi_result[0]["doi"]
-                # else:
-                #     dois[result["text"]] = None
-                dois[result["text"]] = None
-            else:
-                dois[result["text"]] = result["doi"]
-    return dois
-
-
-def sources_from_arxiv(eprint):
-    """
-    Download sources on arXiv for a given preprint.
-
-    Params:
-        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
-
-    Returns a TarFile object of the sources of the arXiv preprint.
-    """
-    r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
-    file_object = io.BytesIO(r.content)
-    return tarfile.open(fileobj=file_object)
-
-
-def bbl_from_arxiv(eprint):
-    """
-    Get the .bbl files (if any) of a given preprint.
-
-    Params:
-        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
-
-    Returns a list of the .bbl files as text (if any) or None.
-    """
-    tf = sources_from_arxiv(eprint)
-    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
-    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
-                 for member in bbl_files]
-    return bbl_files
-
-
-def dois_from_arxiv(eprint):
-    """
-    Get the .bbl files (if any) of a given preprint.
-
-    Params:
-        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
-
-    Returns a dict of cleaned plaintext citations and their associated doi.
-    """
-    bbl_files = bbl_from_arxiv(eprint)
-    dois = {}
-    for bbl in bbl_files:
-        dois.update(dois_from_bbl(bbl))
-    return dois
-
-
 if __name__ == "__main__":
     import pprint
     if len(sys.argv) < 2:
         sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.")
 
     if os.path.isfile(sys.argv[1]):
-        pprint.pprint(dois_from_bbl(sys.argv[1]))
+        pprint.pprint(bbl.get_dois(sys.argv[1]))
     else:
-        pprint.pprint(dois_from_arxiv(sys.argv[1]))
+        pprint.pprint(arxiv.get_dois(sys.argv[1]))
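With the split into modules, main.py is reduced to a thin command-line wrapper; programmatically, the equivalent of the two branches of its __main__ block is (argument values are examples):

    import pprint

    import arxiv
    import bbl

    pprint.pprint(bbl.get_dois("references.bbl"))  # a local .bbl file
    pprint.pprint(arxiv.get_dois("1401.2910"))     # an arXiv e-print id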
regex.py (new file, 14 lines)
@@ -0,0 +1,14 @@
import re

urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
bibitems = re.compile(r"\\bibitem\{.+?\}")
endthebibliography = re.compile(r"\\end\{thebibliography}")

doi = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
doi_pnas = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
doi_jsb = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)
clean_doi = re.compile('^/')
clean_doi_fabse = re.compile('^10.1096')
clean_doi_jcb = re.compile('^10.1083')
clean_doi_len = re.compile(r'\d\.\d')
arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
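A quick illustration of the two patterns that bbl.py relies on most (sample strings are invented):

    import regex

    # URL extraction, used to strip links out of a citation before cleaning.
    print(regex.urls.findall("J. Smith, 2014, http://arxiv.org/abs/1401.2910"))
    # -> ['http://arxiv.org/abs/1401.2910']

    # Bibitem splitting, used by bbl.parse; the leading preamble is dropped.
    print(regex.bibitems.split(r"preamble \bibitem{a} first \bibitem{b} second")[1:])
    # -> [' first ', ' second']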
tools.py (new file, 12 lines)
@@ -0,0 +1,12 @@
def replaceAll(text, dic):
    """Replace all the dic keys by the associated item in text"""
    for i, j in dic.items():
        text = text.replace(i, j)
    return text


def clean_whitespaces(text):
    """
    Remove double whitespaces and trailing . and , from text.
    """
    return ' '.join(text.strip().rstrip(".,").split())
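The two helpers behave as follows (toy inputs):

    import tools

    print(tools.clean_whitespaces("  Some  citation  text.,"))
    # -> 'Some citation text'

    print(tools.replaceAll("a-b.c", {'.': 'A', '-': '0'}))
    # -> 'a0bAc'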