Clean a bit the code

This commit is contained in:
Lucas Verney 2015-12-23 22:49:14 +01:00
parent 97b90ee610
commit 51df30918a
7 changed files with 283 additions and 271 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__

50
arxiv.py Normal file
View File

@ -0,0 +1,50 @@
import bbl
import io
import requests
import tarfile
def sources_from_arxiv(eprint):
"""
Download sources on arXiv for a given preprint.
Params:
- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
Returns a TarFile object of the sources of the arXiv preprint.
"""
r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
file_object = io.BytesIO(r.content)
return tarfile.open(fileobj=file_object)
def bbl_from_arxiv(eprint):
"""
Get the .bbl files (if any) of a given preprint.
Params:
- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
Returns a list of the .bbl files as text (if any) or None.
"""
tf = sources_from_arxiv(eprint)
bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
for member in bbl_files]
return bbl_files
def get_dois(eprint):
"""
Get the .bbl files (if any) of a given preprint.
Params:
- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
Returns a dict of cleaned plaintext citations and their associated doi.
"""
bbl_files = bbl_from_arxiv(eprint)
dois = {}
for bbl_file in bbl_files:
dois.update(bbl.get_dois(bbl_file))
return dois

124
bbl.py Normal file
View File

@ -0,0 +1,124 @@
import doi
import math
import os
import requests
import subprocess
import regex
import tools
def clean_bibitem(bibitem):
"""
Return a plaintext representation of the bibitem from the bbl file.
Params:
- bibitem is the text content of the bibitem.
Returns a cleaned plaintext citation from the bibitem.
"""
script_dir = os.path.dirname(os.path.abspath(__file__))
output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
"-s"],
input=bibitem.encode("utf-8"))
output = output.decode("utf-8")
output = tools.clean_whitespaces(output)
return output
def parse(bbl):
"""
Parse a *.bbl file to get a clean list of plaintext citations.
Params:
- bbl is either the path to the .bbl file or the content of a bbl file.
Returns a list of cleaned plaintext citations.
"""
# Handle path or content
if os.path.isfile(bbl):
with open(bbl, 'r') as fh:
bbl_content = fh.read()
else:
bbl_content = bbl
# Get a list of bibitems
bibitems = regex.bibitems.split(bbl_content)[1:]
bibitems = [regex.endthebibliography.sub("",
i).strip() for i in bibitems]
cleaned_bbl = []
# Clean every bibitem
for bibitem in bibitems:
cleaned_bbl.append(clean_bibitem(bibitem))
return cleaned_bbl
def get_dois(bbl_input):
"""
Get the papers cited by the paper identified by the given DOI.
Params:
- bbl_input is either the path to the .bbl file or the content of a bbl
file.
Returns a dict of cleaned plaintext citations and their associated doi.
"""
cleaned_citations_with_URLs = parse(bbl_input)
dois = {}
cleaned_citations = []
# Try to get the DOI directly from the citation
for citation in cleaned_citations_with_URLs[:]:
# Get all the urls in the citation
raw_urls = regex.urls.findall(citation)
urls = [u.lower() for u in raw_urls]
# Remove URLs in citation
for url in raw_urls:
citation = citation.replace(url, "")
citation = tools.clean_whitespaces(citation)
# Try to find an arXiv link
arxiv_url = doi.extract_arxiv_links(urls)
if arxiv_url:
dois[citation] = arxiv_url
# Try to find a DOI link
doi_url = doi.extract_doi_links(urls)
if doi_url:
dois[citation] = doi_url
# Try to find a direct match using a regex if links search failed
if not doi_url and not arxiv_url:
regex.match = doi.match_doi_or_arxiv(citation)
if regex.match:
print(regex.match)
citation = citation.replace(regex.match[1], "")
if regex.match[0] == "DOI":
dois[citation] = "http://dx.doi.org/%s" % (regex.match[1],)
else:
dois[citation] = (
"http://arxiv.org/abs/%s" %
(regex.match[1].replace("arxiv:", ""),)
)
# If no match found, stack it for next step
if citation not in dois:
cleaned_citations.append(citation)
# Do batch of 10 papers, to prevent from the timeout of crossref
for i in range(math.ceil(len(cleaned_citations) / 10)):
lower_bound = 10 * i
upper_bound = min(10 * (i + 1), len(cleaned_citations))
r = requests.post("http://search.crossref.org/links",
json=cleaned_citations[lower_bound:upper_bound])
for result in r.json()["results"]:
if "doi" not in result:
# If DOI is not found, try a direct query to get a DOI
# r = requests.get("http://search.crossref.org/dois",
# params={
# 'q': result["text"],
# "sort": "score",
# "rows": 1
# })
# doi_result = r.json()
# if len(doi_result) > 0:
# dois[result["text"]] = doi_result[0]["doi"]
# else:
# dois[result["text"]] = None
dois[result["text"]] = None
else:
dois[result["text"]] = result["doi"]
return dois

77
doi.py Normal file
View File

@ -0,0 +1,77 @@
import regex
import tools
def extract_doi_links(urls):
"""
Try to find a DOI from a given list of URLs.
"""
doi_urls = [url for url in urls if "/doi/" in url]
if len(doi_urls) > 0:
return ("http://dx.doi.org" +
doi_urls[0][doi_urls[0].find("/doi/") + 4:])
else:
return None
def extract_arxiv_links(urls):
"""
Try to find an arXiv link from a given list of URLs.
"""
arxiv_urls = [url for url in urls if "://arxiv.org" in url]
if len(arxiv_urls) > 0:
return arxiv_urls[0]
else:
return None
def match_doi_or_arxiv(text, only=["DOI", "arXiv"]):
"""
Search for a valid article ID (DOI or ArXiv) in the given text
(regex-based).
Returns a tuple (type, first matching ID) or None if not found.
From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
"""
text = text.lower()
# Try to extract DOI
if "DOI" in only:
extractID = regex.doi.search(text.replace('Œ', '-'))
if not extractID:
# PNAS fix
extractID = regex.doi_pnas.search(text.
replace('pnas', '/pnas'))
if not extractID:
# JSB fix
extractID = regex.doi_jsb.search(text)
if extractID:
# If DOI extracted, clean it and return it
cleanDOI = False
cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
if regex.clean_doi.search(cleanDOI):
cleanDOI = cleanDOI[1:]
# FABSE J fix
if regex.clean_doi_fabse.search(cleanDOI):
cleanDOI = cleanDOI[:20]
# Second JCB fix
if regex.clean_doi_jcb.search(cleanDOI):
cleanDOI = cleanDOI[:21]
if len(cleanDOI) > 40:
cleanDOItemp = regex.clean_doi_len.sub('000', cleanDOI)
reps = {'.': 'A', '-': '0'}
cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
digitStart = 0
for i in range(len(cleanDOItemp)):
if cleanDOItemp[i].isdigit():
digitStart = 1
if cleanDOItemp[i].isalpha() and digitStart:
break
cleanDOI = cleanDOI[0:(8+i)]
return ("DOI", cleanDOI)
# Else, try to extract arXiv
if "arXiv" in only:
extractID = regex.arXiv.search(text)
if extractID:
return ("arXiv", extractID.group(1))
return None

276
main.py
View File

@ -1,40 +1,10 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import io
import math
import os import os
import re
import requests
import subprocess
import sys import sys
import tarfile
# Local import
regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+") import arxiv
regex_bibitems = re.compile(r"\\bibitem\{.+?\}") import bbl
regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
regex_doi = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
regex_doi_pnas = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
regex_doi_jsb = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)
regex_clean_doi = re.compile('^/')
regex_clean_doi_fabse = re.compile('^10.1096')
regex_clean_doi_jcb = re.compile('^10.1083')
regex_clean_doi_len = re.compile(r'\d\.\d')
regex_arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
def replaceAll(text, dic):
"""Replace all the dic keys by the associated item in text"""
for i, j in dic.items():
text = text.replace(i, j)
return text
def clean_whitespaces(text):
"""
Remove double whitespaces and trailing . and , from text.
"""
return ' '.join(text.strip().rstrip(".,").split())
def oa_from_doi(doi): def oa_from_doi(doi):
@ -45,248 +15,12 @@ def oa_from_doi(doi):
pass pass
def clean_bibitem(bibitem):
"""
Return a plaintext representation of the bibitem from the bbl file.
Params:
- bibitem is the text content of the bibitem.
Returns a cleaned plaintext citation from the bibitem.
"""
script_dir = os.path.dirname(os.path.abspath(__file__))
output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
"-s"],
input=bibitem.encode("utf-8"))
output = output.decode("utf-8")
output = clean_whitespaces(output)
return output
def parse_bbl(bbl):
"""
Parse a *.bbl file to get a clean list of plaintext citations.
Params:
- bbl is either the path to the .bbl file or the content of a bbl file.
Returns a list of cleaned plaintext citations.
"""
# Handle path or content
if os.path.isfile(bbl):
with open(bbl, 'r') as fh:
bbl_content = fh.read()
else:
bbl_content = bbl
# Get a list of bibitems
bibitems = regex_bibitems.split(bbl_content)[1:]
bibitems = [regex_endthebibliography.sub("",
i).strip() for i in bibitems]
cleaned_bbl = []
# Clean every bibitem
for bibitem in bibitems:
cleaned_bbl.append(clean_bibitem(bibitem))
return cleaned_bbl
def extract_doi_links(urls):
"""
Try to find a DOI from a given list of URLs.
"""
doi_urls = [url for url in urls if "/doi/" in url]
if len(doi_urls) > 0:
return ("http://dx.doi.org" +
doi_urls[0][doi_urls[0].find("/doi/") + 4:])
else:
return None
def extract_arxiv_links(urls):
"""
Try to find an arXiv link from a given list of URLs.
"""
arxiv_urls = [url for url in urls if "://arxiv.org" in url]
if len(arxiv_urls) > 0:
return arxiv_urls[0]
else:
return None
def match_doi_or_arxiv(text, only=["DOI", "arXiv"]):
"""
Search for a valid article ID (DOI or ArXiv) in the given text
(regex-based).
Returns a tuple (type, first matching ID) or None if not found.
From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
"""
text = text.lower()
# Try to extract DOI
if "DOI" in only:
extractID = regex_doi.search(text.replace('&#338;', '-'))
if not extractID:
# PNAS fix
extractID = regex_doi_pnas.search(text.
replace('pnas', '/pnas'))
if not extractID:
# JSB fix
extractID = regex_doi_jsb.search(text)
if extractID:
# If DOI extracted, clean it and return it
cleanDOI = False
cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
if regex_clean_doi.search(cleanDOI):
cleanDOI = cleanDOI[1:]
# FABSE J fix
if regex_clean_doi_fabse.search(cleanDOI):
cleanDOI = cleanDOI[:20]
# Second JCB fix
if regex_clean_doi_jcb.search(cleanDOI):
cleanDOI = cleanDOI[:21]
if len(cleanDOI) > 40:
cleanDOItemp = regex_clean_doi_len.sub('000', cleanDOI)
reps = {'.': 'A', '-': '0'}
cleanDOItemp = replaceAll(cleanDOItemp[8:], reps)
digitStart = 0
for i in range(len(cleanDOItemp)):
if cleanDOItemp[i].isdigit():
digitStart = 1
if cleanDOItemp[i].isalpha() and digitStart:
break
cleanDOI = cleanDOI[0:(8+i)]
return ("DOI", cleanDOI)
# Else, try to extract arXiv
if "arXiv" in only:
extractID = regex_arXiv.search(text)
if extractID:
return ("arXiv", extractID.group(1))
return None
def dois_from_bbl(bbl):
"""
Get the papers cited by the paper identified by the given DOI.
Params:
- bbl is either the path to the .bbl file or the content of a bbl file.
Returns a dict of cleaned plaintext citations and their associated doi.
"""
cleaned_citations_with_URLs = parse_bbl(bbl)
dois = {}
cleaned_citations = []
# Try to get the DOI directly from the citation
for citation in cleaned_citations_with_URLs[:]:
# Get all the urls in the citation
raw_urls = regex_urls.findall(citation)
urls = [u.lower() for u in raw_urls]
# Remove URLs in citation
for url in raw_urls:
citation = citation.replace(url, "")
citation = clean_whitespaces(citation)
# Try to find an arXiv link
arxiv_url = extract_arxiv_links(urls)
if arxiv_url:
dois[citation] = arxiv_url
# Try to find a DOI link
doi_url = extract_doi_links(urls)
if doi_url:
dois[citation] = doi_url
# Try to find a direct match using a regex if links search failed
if not doi_url and not arxiv_url:
regex_match = match_doi_or_arxiv(citation)
if regex_match:
print(regex_match)
citation = citation.replace(regex_match[1], "")
if regex_match[0] == "DOI":
dois[citation] = "http://dx.doi.org/%s" % (regex_match[1],)
else:
dois[citation] = (
"http://arxiv.org/abs/%s" %
(regex_match[1].replace("arxiv:", ""),)
)
# If no match found, stack it for next step
if citation not in dois:
cleaned_citations.append(citation)
# Do batch of 10 papers, to prevent from the timeout of crossref
for i in range(math.ceil(len(cleaned_citations) / 10)):
lower_bound = 10 * i
upper_bound = min(10 * (i + 1), len(cleaned_citations))
r = requests.post("http://search.crossref.org/links",
json=cleaned_citations[lower_bound:upper_bound])
for result in r.json()["results"]:
if "doi" not in result:
# If DOI is not found, try a direct query to get a DOI
# r = requests.get("http://search.crossref.org/dois",
# params={
# 'q': result["text"],
# "sort": "score",
# "rows": 1
# })
# doi_result = r.json()
# if len(doi_result) > 0:
# dois[result["text"]] = doi_result[0]["doi"]
# else:
# dois[result["text"]] = None
dois[result["text"]] = None
else:
dois[result["text"]] = result["doi"]
return dois
def sources_from_arxiv(eprint):
"""
Download sources on arXiv for a given preprint.
Params:
- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
Returns a TarFile object of the sources of the arXiv preprint.
"""
r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
file_object = io.BytesIO(r.content)
return tarfile.open(fileobj=file_object)
def bbl_from_arxiv(eprint):
"""
Get the .bbl files (if any) of a given preprint.
Params:
- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
Returns a list of the .bbl files as text (if any) or None.
"""
tf = sources_from_arxiv(eprint)
bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
for member in bbl_files]
return bbl_files
def dois_from_arxiv(eprint):
"""
Get the .bbl files (if any) of a given preprint.
Params:
- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
Returns a dict of cleaned plaintext citations and their associated doi.
"""
bbl_files = bbl_from_arxiv(eprint)
dois = {}
for bbl in bbl_files:
dois.update(dois_from_bbl(bbl))
return dois
if __name__ == "__main__": if __name__ == "__main__":
import pprint import pprint
if len(sys.argv) < 2: if len(sys.argv) < 2:
sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.") sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.")
if os.path.isfile(sys.argv[1]): if os.path.isfile(sys.argv[1]):
pprint.pprint(dois_from_bbl(sys.argv[1])) pprint.pprint(bbl.get_dois(sys.argv[1]))
else: else:
pprint.pprint(dois_from_arxiv(sys.argv[1])) pprint.pprint(arxiv.get_dois(sys.argv[1]))

14
regex.py Normal file
View File

@ -0,0 +1,14 @@
import re
urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
bibitems = re.compile(r"\\bibitem\{.+?\}")
endthebibliography = re.compile(r"\\end\{thebibliography}")
doi = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
doi_pnas = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
doi_jsb = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)
clean_doi = re.compile('^/')
clean_doi_fabse = re.compile('^10.1096')
clean_doi_jcb = re.compile('^10.1083')
clean_doi_len = re.compile(r'\d\.\d')
arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)

12
tools.py Normal file
View File

@ -0,0 +1,12 @@
def replaceAll(text, dic):
"""Replace all the dic keys by the associated item in text"""
for i, j in dic.items():
text = text.replace(i, j)
return text
def clean_whitespaces(text):
"""
Remove double whitespaces and trailing . and , from text.
"""
return ' '.join(text.strip().rstrip(".,").split())