From 97b90ee6103d1c72d901c86ecb2ac25cd69b55d9 Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Wed, 23 Dec 2015 22:37:50 +0100
Subject: [PATCH] Add regex matching of DOI / arXiv ids

---
 main.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/main.py b/main.py
index 1e2613a..ec08aac 100755
--- a/main.py
+++ b/main.py
@@ -13,6 +13,22 @@ regex_urls = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[
 regex_bibitems = re.compile(r"\\bibitem\{.+?\}")
 regex_endthebibliography = re.compile(r"\\end\{thebibliography}")
+regex_doi = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
+regex_doi_pnas = re.compile(r'(?<=doi).?10\.1073/pnas\.\d+', re.IGNORECASE)
+regex_doi_jcb = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
+regex_clean_doi = re.compile(r'^/')
+regex_clean_doi_faseb = re.compile(r'^10\.1096')
+regex_clean_doi_jcb = re.compile(r'^10\.1083')
+regex_clean_doi_len = re.compile(r'\d\.\d')
+regex_arXiv = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
+
+
+def replaceAll(text, dic):
+    """Replace every key of dic found in text with its associated value."""
+    for i, j in dic.items():
+        text = text.replace(i, j)
+    return text
+
 
 
 def clean_whitespaces(text):
     """
@@ -96,6 +112,58 @@ def extract_arxiv_links(urls):
     return None
 
 
+def match_doi_or_arxiv(text, only=("DOI", "arXiv")):
+    """
+    Search for a valid article ID (DOI or arXiv) in the given text
+    (regex-based).
+
+    Returns a tuple (type, first matching ID) or None if not found.
+    From http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
+    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
+    """
+    text = text.lower()
+    # Try to extract a DOI (the text is lowercased, hence 'œ', not 'Œ')
+    if "DOI" in only:
+        extractID = regex_doi.search(text.replace('œ', '-'))
+        if not extractID:
+            # PNAS fix
+            extractID = regex_doi_pnas.search(
+                text.replace('pnas', '/pnas'))
+        if not extractID:
+            # JCB fix
+            extractID = regex_doi_jcb.search(text)
+        if extractID:
+            # A DOI was matched: clean it up and return it.
+            # Drop any separator characters matched along with the ID.
+            cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
+            if regex_clean_doi.search(cleanDOI):
+                cleanDOI = cleanDOI[1:]
+            # FASEB J fix
+            if regex_clean_doi_faseb.search(cleanDOI):
+                cleanDOI = cleanDOI[:20]
+            # Second JCB fix
+            if regex_clean_doi_jcb.search(cleanDOI):
+                cleanDOI = cleanDOI[:21]
+            if len(cleanDOI) > 40:
+                cleanDOItemp = regex_clean_doi_len.sub('000', cleanDOI)
+                reps = {'.': 'A', '-': '0'}
+                cleanDOItemp = replaceAll(cleanDOItemp[8:], reps)
+                digitStart = False
+                for i in range(len(cleanDOItemp)):
+                    if cleanDOItemp[i].isdigit():
+                        digitStart = True
+                    if cleanDOItemp[i].isalpha() and digitStart:
+                        break
+                cleanDOI = cleanDOI[0:(8 + i)]
+            return ("DOI", cleanDOI)
+    # Else, try to extract an arXiv ID
+    if "arXiv" in only:
+        extractID = regex_arXiv.search(text)
+        if extractID:
+            return ("arXiv", extractID.group(1))
+    return None
+
+
 def dois_from_bbl(bbl):
     """
     Get the papers cited by the paper identified by the given DOI.
@@ -125,6 +193,19 @@ def dois_from_bbl(bbl):
         doi_url = extract_doi_links(urls)
         if doi_url:
             dois[citation] = doi_url
+        # Try a direct regex match if the link-based searches failed
+        if not doi_url and not arxiv_url:
+            regex_match = match_doi_or_arxiv(citation)
+            if regex_match:
+                # Strip the matched ID from the citation text
+                citation = citation.replace(regex_match[1], "")
+                if regex_match[0] == "DOI":
+                    dois[citation] = "http://dx.doi.org/%s" % (regex_match[1],)
+                else:
+                    dois[citation] = (
+                        "http://arxiv.org/abs/%s" %
+                        (regex_match[1].replace("arxiv:", ""),)
+                    )
         # If no match found, stack it for next step
         if citation not in dois:
            cleaned_citations.append(citation)
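
For review, a minimal usage sketch of the new helper (not part of the patch). It assumes main.py is importable from the working directory; the citation strings are invented for illustration:

# Minimal usage sketch for match_doi_or_arxiv() -- not part of the patch.
# Assumes main.py is importable; the citation strings below are made up.
from main import match_doi_or_arxiv

# Citation carrying a DOI
print(match_doi_or_arxiv("F. Author et al., PNAS (2015), doi:10.1073/pnas.0123456789"))
# -> ('DOI', '10.1073/pnas.0123456789')

# Citation carrying an arXiv ID
print(match_doi_or_arxiv("S. Writer, preprint, arXiv:1512.01234v2"))
# -> ('arXiv', '1512.01234v2')

# No identifier at all
print(match_doi_or_arxiv("A. Nobody, Some Book, Some Publisher, 2015."))
# -> None

Since the DOI branch is tried first, a citation containing both a DOI and an arXiv ID resolves to its DOI. The helper lowercases its input, so the returned identifiers always come back lowercase.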