From 91685bc46b185adaa5d0ad12edbad816ced6a028 Mon Sep 17 00:00:00 2001 From: Phyks Date: Tue, 29 Apr 2014 21:55:35 +0200 Subject: [PATCH] Various re.compile --- README.md | 2 -- fetcher.py | 32 +++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 05553b9..4f2053d 100644 --- a/README.md +++ b/README.md @@ -108,8 +108,6 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really 10. Refactor 11. Use bibtex-parser lib to write bibtex, instead of parsed2BibTex 12. Rebuild function - 13. Split main.py - 14. Various re.compile ? 15. Check output of subprocesses before it ends 16. TODO in files 20. No DOI for arXiv / HAL diff --git a/fetcher.py b/fetcher.py index 2f1276b..297a6c7 100644 --- a/fetcher.py +++ b/fetcher.py @@ -40,6 +40,9 @@ def download(url): return False +isbn_re = re.compile(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[-][0-9])") + + def findISBN(src): """Search for a valid ISBN in src. @@ -59,8 +62,7 @@ def findISBN(src): tools.warning(extractfull[1]) return False extractfull = extractfull[0] - extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])", - extractfull.lower().replace('Œ', '-')) + extractISBN = isbn_re.search(extractfull.lower().replace('Œ', '-')) cleanISBN = False # Clean ISBN is the ISBN number without separators if extractISBN: @@ -78,6 +80,15 @@ def isbn2Bib(isbn): return '' +doi_re = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]') +doi_pnas_re = re.compile('(?<=doi).?10.1073/pnas\.\d+') +doi_jsb_re = re.compile('10\.1083/jcb\.\d{9}') +clean_doi_re = re.compile('^/') +clean_doi_fabse_re = re.compile('^10.1096') +clean_doi_jcb_re = re.compile('^10.1083') +clean_doi_len_re = re.compile(r'\d\.\d') + + def findDOI(src): """Search for a valid DOI in src. @@ -99,29 +110,28 @@ def findDOI(src): tools.warning(extractfull[1]) return False extractfull = extractfull[0] - extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', - extractfull.lower().replace('Œ', '-')) + extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-')) if not extractDOI: # PNAS fix - extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+', - extractfull.lower().replace('pnas', '/pnas')) + extractDOI = doi_pnas_re.search(extractfull.lower().replace('pnas', + '/pnas')) if not extractDOI: # JSB fix - extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower()) + extractDOI = doi_jsb_re.search(extractfull.lower()) cleanDOI = False if extractDOI: cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '') - if re.search('^/', cleanDOI): + if clean_doi_re.search(cleanDOI): cleanDOI = cleanDOI[1:] # FABSE J fix - if re.search('^10.1096', cleanDOI): + if clean_doi_fabse_re.search(cleanDOI): cleanDOI = cleanDOI[:20] # Second JCB fix - if re.search('^10.1083', cleanDOI): + if clean_doi_jcb_re.search(cleanDOI): cleanDOI = cleanDOI[:21] if len(cleanDOI) > 40: - cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI) + cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI) reps = {'.': 'A', '-': '0'} cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps) digitStart = 0