From 51df30918a92b6a59a35c8ab2767094a2e81e817 Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Wed, 23 Dec 2015 22:49:14 +0100
Subject: [PATCH] Clean a bit the code

---
 .gitignore |   1 +
 arxiv.py   |  50 ++++++++++
 bbl.py     | 124 ++++++++++++++++++++++++
 doi.py     |  77 +++++++++++++++
 main.py    | 276 +----------------------------------------------------
 regex.py   |  14 +++
 tools.py   |  12 +++
 7 files changed, 283 insertions(+), 271 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 arxiv.py
 create mode 100644 bbl.py
 create mode 100644 doi.py
 create mode 100644 regex.py
 create mode 100644 tools.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/arxiv.py b/arxiv.py
new file mode 100644
index 0000000..f9dc958
--- /dev/null
+++ b/arxiv.py
@@ -0,0 +1,50 @@
+import bbl
+import io
+import requests
+import tarfile
+
+
+def sources_from_arxiv(eprint):
+    """
+    Download the sources of a given arXiv preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a TarFile object of the sources of the arXiv preprint.
+    """
+    r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))
+    file_object = io.BytesIO(r.content)
+    return tarfile.open(fileobj=file_object)
+
+
+def bbl_from_arxiv(eprint):
+    """
+    Get the .bbl files (if any) of a given preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a list of the .bbl files as text (empty if there are none).
+    """
+    tf = sources_from_arxiv(eprint)
+    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
+    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
+                 for member in bbl_files]
+    return bbl_files
+
+
+def get_dois(eprint):
+    """
+    Get the DOIs of the papers cited by a given preprint.
+
+    Params:
+        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).
+
+    Returns a dict of cleaned plaintext citations and their associated doi.
+    """
+    bbl_files = bbl_from_arxiv(eprint)
+    dois = {}
+    for bbl_file in bbl_files:
+        dois.update(bbl.get_dois(bbl_file))
+    return dois
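
For orientation, here is a minimal usage sketch of the new arxiv module. It is not part of the patch; the eprint id is illustrative, and the requests dependency plus network access are assumed.

# Usage sketch for the new arxiv module (illustrative, not part of the patch).
import arxiv

# List the .bbl bibliography files shipped in the preprint's sources.
bbl_files = arxiv.bbl_from_arxiv("1401.2910")
print("%d .bbl file(s) found" % len(bbl_files))

# Map each cleaned plaintext citation to its DOI/arXiv URL (None if unresolved).
for citation, url in arxiv.get_dois("1401.2910").items():
    print(citation, "->", url)
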
diff --git a/bbl.py b/bbl.py
new file mode 100644
index 0000000..97ace12
--- /dev/null
+++ b/bbl.py
@@ -0,0 +1,124 @@
+import doi
+import math
+import os
+import requests
+import subprocess
+
+import regex
+import tools
+
+
+def clean_bibitem(bibitem):
+    """
+    Return a plaintext representation of the bibitem from the bbl file.
+
+    Params:
+        - bibitem is the text content of the bibitem.
+
+    Returns a cleaned plaintext citation from the bibitem.
+    """
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
+                                      "-s"],
+                                     input=bibitem.encode("utf-8"))
+    output = output.decode("utf-8")
+    output = tools.clean_whitespaces(output)
+    return output
+
+
+def parse(bbl):
+    """
+    Parse a *.bbl file to get a clean list of plaintext citations.
+
+    Params:
+        - bbl is either the path to the .bbl file or the content of a bbl
+          file.
+
+    Returns a list of cleaned plaintext citations.
+    """
+    # Handle path or content
+    if os.path.isfile(bbl):
+        with open(bbl, 'r') as fh:
+            bbl_content = fh.read()
+    else:
+        bbl_content = bbl
+    # Get a list of bibitems
+    bibitems = regex.bibitems.split(bbl_content)[1:]
+    bibitems = [regex.endthebibliography.sub("",
+                                             i).strip() for i in bibitems]
+    cleaned_bbl = []
+    # Clean every bibitem
+    for bibitem in bibitems:
+        cleaned_bbl.append(clean_bibitem(bibitem))
+    return cleaned_bbl
+
+
+def get_dois(bbl_input):
+    """
+    Get the DOIs of the papers cited in the given .bbl file.
+
+    Params:
+        - bbl_input is either the path to the .bbl file or the content of a
+          bbl file.
+
+    Returns a dict of cleaned plaintext citations and their associated doi.
+    """
+    cleaned_citations_with_URLs = parse(bbl_input)
+    dois = {}
+    cleaned_citations = []
+    # Try to get the DOI directly from the citation
+    for citation in cleaned_citations_with_URLs[:]:
+        # Get all the URLs in the citation
+        raw_urls = regex.urls.findall(citation)
+        urls = [u.lower() for u in raw_urls]
+        # Remove URLs from the citation
+        for url in raw_urls:
+            citation = citation.replace(url, "")
+        citation = tools.clean_whitespaces(citation)
+        # Try to find an arXiv link
+        arxiv_url = doi.extract_arxiv_links(urls)
+        if arxiv_url:
+            dois[citation] = arxiv_url
+        # Try to find a DOI link
+        doi_url = doi.extract_doi_links(urls)
+        if doi_url:
+            dois[citation] = doi_url
+        # Try to find a direct match using a regex if the URL search failed
+        if not doi_url and not arxiv_url:
+            regex_match = doi.match_doi_or_arxiv(citation)
+            if regex_match:
+                print(regex_match)
+                citation = citation.replace(regex_match[1], "")
+                if regex_match[0] == "DOI":
+                    dois[citation] = "http://dx.doi.org/%s" % (regex_match[1],)
+                else:
+                    dois[citation] = (
+                        "http://arxiv.org/abs/%s" %
+                        (regex_match[1].replace("arxiv:", ""),)
+                    )
+        # If no match was found, queue the citation for the CrossRef step
+        if citation not in dois:
+            cleaned_citations.append(citation)
+    # Query CrossRef in batches of 10 papers to avoid hitting its timeout
+    for i in range(math.ceil(len(cleaned_citations) / 10)):
+        lower_bound = 10 * i
+        upper_bound = min(10 * (i + 1), len(cleaned_citations))
+        r = requests.post("http://search.crossref.org/links",
+                          json=cleaned_citations[lower_bound:upper_bound])
+        for result in r.json()["results"]:
+            if "doi" not in result:
+                # If no DOI was found, try a direct query to get one
+                # r = requests.get("http://search.crossref.org/dois",
+                #                  params={
+                #                      'q': result["text"],
+                #                      "sort": "score",
+                #                      "rows": 1
+                #                  })
+                # doi_result = r.json()
+                # if len(doi_result) > 0:
+                #     dois[result["text"]] = doi_result[0]["doi"]
+                # else:
+                #     dois[result["text"]] = None
+                dois[result["text"]] = None
+            else:
+                dois[result["text"]] = result["doi"]
+    return dois
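
The batched POST at the end of get_dois is the fallback path for citations that carried no explicit identifier. Below is a condensed sketch of that pattern, assuming the same search.crossref.org endpoint as above; resolve_batch and batch_size are names introduced here only for illustration.

# Condensed sketch of the batched CrossRef lookup used in get_dois above.
import math
import requests

def resolve_batch(citations, batch_size=10):
    dois = {}
    for i in range(math.ceil(len(citations) / batch_size)):
        batch = citations[batch_size * i:batch_size * (i + 1)]
        r = requests.post("http://search.crossref.org/links", json=batch)
        for result in r.json()["results"]:
            # "doi" is absent from a result when no match was found.
            dois[result["text"]] = result.get("doi")
    return dois
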
diff --git a/doi.py b/doi.py
new file mode 100644
index 0000000..76c69a6
--- /dev/null
+++ b/doi.py
@@ -0,0 +1,77 @@
+import regex
+import tools
+
+
+def extract_doi_links(urls):
+    """
+    Try to find a DOI in a given list of URLs.
+    """
+    doi_urls = [url for url in urls if "/doi/" in url]
+    if len(doi_urls) > 0:
+        return ("http://dx.doi.org" +
+                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
+    else:
+        return None
+
+
+def extract_arxiv_links(urls):
+    """
+    Try to find an arXiv link in a given list of URLs.
+    """
+    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
+    if len(arxiv_urls) > 0:
+        return arxiv_urls[0]
+    else:
+        return None
+
+
+def match_doi_or_arxiv(text, only=["DOI", "arXiv"]):
+    """
+    Search for a valid article ID (DOI or arXiv) in the given text
+    (regex-based).
+
+    Returns a tuple (type, first matching ID) or None if not found.
+    From: http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
+    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
+    """
+    text = text.lower()
+    # Try to extract a DOI
+    if "DOI" in only:
+        extractID = regex.doi.search(text.replace('Œ', '-'))
+        if not extractID:
+            # PNAS fix
+            extractID = regex.doi_pnas.search(text.
+                                              replace('pnas', '/pnas'))
+            if not extractID:
+                # JCB fix
+                extractID = regex.doi_jsb.search(text)
+        if extractID:
+            # If a DOI was extracted, clean it and return it
+            cleanDOI = False
+            cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
+            if regex.clean_doi.search(cleanDOI):
+                cleanDOI = cleanDOI[1:]
+            # FASEB J fix
+            if regex.clean_doi_fabse.search(cleanDOI):
+                cleanDOI = cleanDOI[:20]
+            # Second JCB fix
+            if regex.clean_doi_jcb.search(cleanDOI):
+                cleanDOI = cleanDOI[:21]
+            if len(cleanDOI) > 40:
+                cleanDOItemp = regex.clean_doi_len.sub('000', cleanDOI)
+                reps = {'.': 'A', '-': '0'}
+                cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
+                digitStart = 0
+                for i in range(len(cleanDOItemp)):
+                    if cleanDOItemp[i].isdigit():
+                        digitStart = 1
+                    if cleanDOItemp[i].isalpha() and digitStart:
+                        break
+                cleanDOI = cleanDOI[0:(8+i)]
+            return ("DOI", cleanDOI)
+    # Else, try to extract an arXiv id
+    if "arXiv" in only:
+        extractID = regex.arXiv.search(text)
+        if extractID:
+            return ("arXiv", extractID.group(1))
+    return None
diff --git a/main.py b/main.py
index ec08aac..f2fd229 100755
--- a/main.py
+++ b/main.py
@@ -1,40 +1,10 @@
 #!/usr/bin/env python3
-import io
-import math
 import os
-import re
-import requests
-import subprocess
 import sys
-import tarfile
-
-regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
-regex_bibitems = re.compile(r"\\bibitem\{.+?\}")
-regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
-
-regex_doi = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
-regex_doi_pnas = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
-regex_doi_jsb = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)
-regex_clean_doi = re.compile('^/')
-regex_clean_doi_fabse = re.compile('^10.1096')
-regex_clean_doi_jcb = re.compile('^10.1083')
-regex_clean_doi_len = re.compile(r'\d\.\d')
-regex_arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
-
-
-def replaceAll(text, dic):
-    """Replace all the dic keys by the associated item in text"""
-    for i, j in dic.items():
-        text = text.replace(i, j)
-    return text
-
-
-def clean_whitespaces(text):
-    """
-    Remove double whitespaces and trailing . and , from text.
-    """
-    return ' '.join(text.strip().rstrip(".,").split())
+# Local import
+import arxiv
+import bbl
 
 
 def oa_from_doi(doi):
@@ -45,248 +15,12 @@ def oa_from_doi(doi):
     pass
 
 
-def clean_bibitem(bibitem):
-    """
-    Return a plaintext representation of the bibitem from the bbl file.
-
-    Params:
-        - bibitem is the text content of the bibitem.
-
-    Returns a cleaned plaintext citation from the bibitem.
-    """
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
-                                      "-s"],
-                                     input=bibitem.encode("utf-8"))
-    output = output.decode("utf-8")
-    output = clean_whitespaces(output)
-    return output
-
-
-def parse_bbl(bbl):
-    """
-    Parse a *.bbl file to get a clean list of plaintext citations.
- - Params: - - bbl is either the path to the .bbl file or the content of a bbl file. - - Returns a list of cleaned plaintext citations. - """ - # Handle path or content - if os.path.isfile(bbl): - with open(bbl, 'r') as fh: - bbl_content = fh.read() - else: - bbl_content = bbl - # Get a list of bibitems - bibitems = regex_bibitems.split(bbl_content)[1:] - bibitems = [regex_endthebibliography.sub("", - i).strip() for i in bibitems] - cleaned_bbl = [] - # Clean every bibitem - for bibitem in bibitems: - cleaned_bbl.append(clean_bibitem(bibitem)) - return cleaned_bbl - - -def extract_doi_links(urls): - """ - Try to find a DOI from a given list of URLs. - """ - doi_urls = [url for url in urls if "/doi/" in url] - if len(doi_urls) > 0: - return ("http://dx.doi.org" + - doi_urls[0][doi_urls[0].find("/doi/") + 4:]) - else: - return None - - -def extract_arxiv_links(urls): - """ - Try to find an arXiv link from a given list of URLs. - """ - arxiv_urls = [url for url in urls if "://arxiv.org" in url] - if len(arxiv_urls) > 0: - return arxiv_urls[0] - else: - return None - - -def match_doi_or_arxiv(text, only=["DOI", "arXiv"]): - """ - Search for a valid article ID (DOI or ArXiv) in the given text - (regex-based). - - Returns a tuple (type, first matching ID) or None if not found. - From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/ - and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb - """ - text = text.lower() - # Try to extract DOI - if "DOI" in only: - extractID = regex_doi.search(text.replace('Œ', '-')) - if not extractID: - # PNAS fix - extractID = regex_doi_pnas.search(text. - replace('pnas', '/pnas')) - if not extractID: - # JSB fix - extractID = regex_doi_jsb.search(text) - if extractID: - # If DOI extracted, clean it and return it - cleanDOI = False - cleanDOI = extractID.group(0).replace(':', '').replace(' ', '') - if regex_clean_doi.search(cleanDOI): - cleanDOI = cleanDOI[1:] - # FABSE J fix - if regex_clean_doi_fabse.search(cleanDOI): - cleanDOI = cleanDOI[:20] - # Second JCB fix - if regex_clean_doi_jcb.search(cleanDOI): - cleanDOI = cleanDOI[:21] - if len(cleanDOI) > 40: - cleanDOItemp = regex_clean_doi_len.sub('000', cleanDOI) - reps = {'.': 'A', '-': '0'} - cleanDOItemp = replaceAll(cleanDOItemp[8:], reps) - digitStart = 0 - for i in range(len(cleanDOItemp)): - if cleanDOItemp[i].isdigit(): - digitStart = 1 - if cleanDOItemp[i].isalpha() and digitStart: - break - cleanDOI = cleanDOI[0:(8+i)] - return ("DOI", cleanDOI) - # Else, try to extract arXiv - if "arXiv" in only: - extractID = regex_arXiv.search(text) - if extractID: - return ("arXiv", extractID.group(1)) - return None - - -def dois_from_bbl(bbl): - """ - Get the papers cited by the paper identified by the given DOI. - - Params: - - bbl is either the path to the .bbl file or the content of a bbl file. - - Returns a dict of cleaned plaintext citations and their associated doi. 
- """ - cleaned_citations_with_URLs = parse_bbl(bbl) - dois = {} - cleaned_citations = [] - # Try to get the DOI directly from the citation - for citation in cleaned_citations_with_URLs[:]: - # Get all the urls in the citation - raw_urls = regex_urls.findall(citation) - urls = [u.lower() for u in raw_urls] - # Remove URLs in citation - for url in raw_urls: - citation = citation.replace(url, "") - citation = clean_whitespaces(citation) - # Try to find an arXiv link - arxiv_url = extract_arxiv_links(urls) - if arxiv_url: - dois[citation] = arxiv_url - # Try to find a DOI link - doi_url = extract_doi_links(urls) - if doi_url: - dois[citation] = doi_url - # Try to find a direct match using a regex if links search failed - if not doi_url and not arxiv_url: - regex_match = match_doi_or_arxiv(citation) - if regex_match: - print(regex_match) - citation = citation.replace(regex_match[1], "") - if regex_match[0] == "DOI": - dois[citation] = "http://dx.doi.org/%s" % (regex_match[1],) - else: - dois[citation] = ( - "http://arxiv.org/abs/%s" % - (regex_match[1].replace("arxiv:", ""),) - ) - # If no match found, stack it for next step - if citation not in dois: - cleaned_citations.append(citation) - # Do batch of 10 papers, to prevent from the timeout of crossref - for i in range(math.ceil(len(cleaned_citations) / 10)): - lower_bound = 10 * i - upper_bound = min(10 * (i + 1), len(cleaned_citations)) - r = requests.post("http://search.crossref.org/links", - json=cleaned_citations[lower_bound:upper_bound]) - for result in r.json()["results"]: - if "doi" not in result: - # If DOI is not found, try a direct query to get a DOI - # r = requests.get("http://search.crossref.org/dois", - # params={ - # 'q': result["text"], - # "sort": "score", - # "rows": 1 - # }) - # doi_result = r.json() - # if len(doi_result) > 0: - # dois[result["text"]] = doi_result[0]["doi"] - # else: - # dois[result["text"]] = None - dois[result["text"]] = None - else: - dois[result["text"]] = result["doi"] - return dois - - -def sources_from_arxiv(eprint): - """ - Download sources on arXiv for a given preprint. - - Params: - - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). - - Returns a TarFile object of the sources of the arXiv preprint. - """ - r = requests.get("http://arxiv.org/e-print/%s" % (eprint,)) - file_object = io.BytesIO(r.content) - return tarfile.open(fileobj=file_object) - - -def bbl_from_arxiv(eprint): - """ - Get the .bbl files (if any) of a given preprint. - - Params: - - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). - - Returns a list of the .bbl files as text (if any) or None. - """ - tf = sources_from_arxiv(eprint) - bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")] - bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING) - for member in bbl_files] - return bbl_files - - -def dois_from_arxiv(eprint): - """ - Get the .bbl files (if any) of a given preprint. - - Params: - - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1). - - Returns a dict of cleaned plaintext citations and their associated doi. 
-    """
-    bbl_files = bbl_from_arxiv(eprint)
-    dois = {}
-    for bbl in bbl_files:
-        dois.update(dois_from_bbl(bbl))
-    return dois
-
-
 if __name__ == "__main__":
     import pprint
     if len(sys.argv) < 2:
         sys.exit("Usage: " + sys.argv[0] + " BBL_FILE|ARXIV_EPRINT.")
     if os.path.isfile(sys.argv[1]):
-        pprint.pprint(dois_from_bbl(sys.argv[1]))
+        pprint.pprint(bbl.get_dois(sys.argv[1]))
     else:
-        pprint.pprint(dois_from_arxiv(sys.argv[1]))
+        pprint.pprint(arxiv.get_dois(sys.argv[1]))
diff --git a/regex.py b/regex.py
new file mode 100644
index 0000000..940f2de
--- /dev/null
+++ b/regex.py
@@ -0,0 +1,14 @@
+import re
+
+urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+bibitems = re.compile(r"\\bibitem\{.+?\}")
+endthebibliography = re.compile(r"\\end\{thebibliography}")
+
+doi = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
+doi_pnas = re.compile(r'(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
+doi_jsb = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
+clean_doi = re.compile('^/')
+clean_doi_fabse = re.compile('^10.1096')
+clean_doi_jcb = re.compile('^10.1083')
+clean_doi_len = re.compile(r'\d\.\d')
+arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
diff --git a/tools.py b/tools.py
new file mode 100644
index 0000000..c793256
--- /dev/null
+++ b/tools.py
@@ -0,0 +1,12 @@
+def replaceAll(text, dic):
+    """Replace each key of dic found in text with its associated value."""
+    for i, j in dic.items():
+        text = text.replace(i, j)
+    return text
+
+
+def clean_whitespaces(text):
+    """
+    Collapse runs of whitespace and strip trailing . and , from text.
+    """
+    return ' '.join(text.strip().rstrip(".,").split())
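
As a quick sanity check of the split modules, the regex-based extraction in doi.py can be exercised directly; the sample citation strings below are illustrative.

# Quick check of the regex-based identifier extraction (sample strings are
# illustrative; match_doi_or_arxiv returns a (type, id) tuple or None).
import doi

print(doi.match_doi_or_arxiv("J. Doe et al., doi:10.1073/pnas.0123456789"))
# -> ("DOI", "10.1073/pnas.0123456789")
print(doi.match_doi_or_arxiv("J. Doe et al., arXiv: 1401.2910"))
# -> ("arXiv", "1401.2910")
print(doi.match_doi_or_arxiv("no identifier in this citation"))
# -> None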