Various re.compile

This commit is contained in:
Phyks 2014-04-29 21:55:35 +02:00
parent 7d36f3206f
commit 91685bc46b
2 changed files with 21 additions and 13 deletions

View File

@ -108,8 +108,6 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really
10. Refactor 10. Refactor
11. Use bibtex-parser lib to write bibtex, instead of parsed2BibTex 11. Use bibtex-parser lib to write bibtex, instead of parsed2BibTex
12. Rebuild function 12. Rebuild function
13. Split main.py
14. Various re.compile ?
15. Check output of subprocesses before it ends 15. Check output of subprocesses before it ends
16. TODO in files 16. TODO in files
20. No DOI for arXiv / HAL 20. No DOI for arXiv / HAL

View File

@ -40,6 +40,9 @@ def download(url):
return False return False
# Pre-compiled pattern locating an ISBN introduced by the literal "isbn "
# (lower case; callers lower-case the text before searching).
# Group 1 is the ISBN itself: an optional 3-digit prefix, then groups of
# 1, 2, 6 and 1 digits. Every separator -- including the one in front of the
# final check digit -- may be either a space or a hyphen, exactly as in the
# inline pattern this constant replaces.
isbn_re = re.compile(
    r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])"
)
def findISBN(src): def findISBN(src):
"""Search for a valid ISBN in src. """Search for a valid ISBN in src.
@ -59,8 +62,7 @@ def findISBN(src):
tools.warning(extractfull[1]) tools.warning(extractfull[1])
return False return False
extractfull = extractfull[0] extractfull = extractfull[0]
extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])", extractISBN = isbn_re.search(extractfull.lower().replace('Œ', '-'))
extractfull.lower().replace('Œ', '-'))
cleanISBN = False cleanISBN = False
# Clean ISBN is the ISBN number without separators # Clean ISBN is the ISBN number without separators
if extractISBN: if extractISBN:
@ -78,6 +80,15 @@ def isbn2Bib(isbn):
return '' return ''
# Pre-compiled patterns used by findDOI. All of them are written as raw
# strings so that regex escapes (\s, \S, \d, \.) are never interpreted as
# (invalid) string escapes, and the dots of literal DOI prefixes are escaped
# so that they only match a real ".".

# Generic DOI following the text "doi": optional "/" and ":" and one optional
# whitespace, a 7-character prefix made of digits and dots, a "/", then a
# non-blank suffix ending in a digit.
doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9.]{7}/\S*[0-9]')
# PNAS fallback: "doi", at most one arbitrary character, then the
# 10.1073/pnas.<digits> identifier.
doi_pnas_re = re.compile(r'(?<=doi).?10\.1073/pnas\.\d+')
# JCB fallback: 10.1083/jcb.<9 digits>, with no "doi" marker required.
# NOTE: the name says "jsb" although the pattern targets "jcb"; it is kept
# unchanged because findDOI refers to it by this name.
doi_jsb_re = re.compile(r'10\.1083/jcb\.\d{9}')
# Leading "/" left over from the optional "/" of doi_re.
clean_doi_re = re.compile(r'^/')
# DOI starting with the 10.1096 prefix.
clean_doi_fabse_re = re.compile(r'^10\.1096')
# DOI starting with the 10.1083 prefix.
clean_doi_jcb_re = re.compile(r'^10\.1083')
# A "digit.digit" sequence, substituted when cleaning over-long DOIs.
clean_doi_len_re = re.compile(r'\d\.\d')
def findDOI(src): def findDOI(src):
"""Search for a valid DOI in src. """Search for a valid DOI in src.
@ -99,29 +110,28 @@ def findDOI(src):
tools.warning(extractfull[1]) tools.warning(extractfull[1])
return False return False
extractfull = extractfull[0] extractfull = extractfull[0]
extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', extractDOI = doi_re.search(extractfull.lower().replace('&#338;', '-'))
extractfull.lower().replace('&#338;', '-'))
if not extractDOI: if not extractDOI:
# PNAS fix # PNAS fix
extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+', extractDOI = doi_pnas_re.search(extractfull.lower().replace('pnas',
extractfull.lower().replace('pnas', '/pnas')) '/pnas'))
if not extractDOI: if not extractDOI:
# JSB fix # JSB fix
extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower()) extractDOI = doi_jsb_re.search(extractfull.lower())
cleanDOI = False cleanDOI = False
if extractDOI: if extractDOI:
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '') cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
if re.search('^/', cleanDOI): if clean_doi_re.search(cleanDOI):
cleanDOI = cleanDOI[1:] cleanDOI = cleanDOI[1:]
# FABSE J fix # FABSE J fix
if re.search('^10.1096', cleanDOI): if clean_doi_fabse_re.search(cleanDOI):
cleanDOI = cleanDOI[:20] cleanDOI = cleanDOI[:20]
# Second JCB fix # Second JCB fix
if re.search('^10.1083', cleanDOI): if clean_doi_jcb_re.search(cleanDOI):
cleanDOI = cleanDOI[:21] cleanDOI = cleanDOI[:21]
if len(cleanDOI) > 40: if len(cleanDOI) > 40:
cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI) cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
reps = {'.': 'A', '-': '0'} reps = {'.': 'A', '-': '0'}
cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps) cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
digitStart = 0 digitStart = 0