Various re.compile
This commit is contained in:
parent
7d36f3206f
commit
91685bc46b
@ -108,8 +108,6 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really
|
||||
10. Refactor
|
||||
11. Use bibtex-parser lib to write bibtex, instead of parsed2BibTex
|
||||
12. Rebuild function
|
||||
13. Split main.py
|
||||
14. Various re.compile ?
|
||||
15. Check output of subprocesses before it ends
|
||||
16. TODO in files
|
||||
20. No DOI for arXiv / HAL
|
||||
|
32
fetcher.py
32
fetcher.py
@ -40,6 +40,9 @@ def download(url):
|
||||
return False
|
||||
|
||||
|
||||
# Matches "isbn " followed by an optional 3-digit prefix and the 1-2-6-1 digit
# groups of an ISBN. Every separator may be a space or a hyphen, exactly as in
# the inline pattern this constant replaces (the searched text is lower-cased
# by the caller before matching).
isbn_re = re.compile(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])")
|
||||
|
||||
|
||||
def findISBN(src):
|
||||
"""Search for a valid ISBN in src.
|
||||
|
||||
@ -59,8 +62,7 @@ def findISBN(src):
|
||||
tools.warning(extractfull[1])
|
||||
return False
|
||||
extractfull = extractfull[0]
|
||||
extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
|
||||
extractfull.lower().replace('Œ', '-'))
|
||||
extractISBN = isbn_re.search(extractfull.lower().replace('Œ', '-'))
|
||||
cleanISBN = False
|
||||
# Clean ISBN is the ISBN number without separators
|
||||
if extractISBN:
|
||||
@ -78,6 +80,15 @@ def isbn2Bib(isbn):
|
||||
return ''
|
||||
|
||||
|
||||
# Generic DOI pattern: text following "doi" (optional "/", ":" and one
# whitespace), a 7-character digits-and-dots prefix, a slash, then a
# non-whitespace suffix ending in a digit. Raw string so that \s, \. and \S
# reach the regex engine without invalid-escape warnings.
doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]')
|
||||
# PNAS fallback: "doi", at most one arbitrary character, then the
# 10.1073/pnas.<digits> form. Raw string so that \. and \d are passed to the
# regex engine unchanged instead of being treated as string escapes.
doi_pnas_re = re.compile(r'(?<=doi).?10.1073/pnas\.\d+')
|
||||
# JCB fallback: a bare 10.1083/jcb.<nine digits> identifier anywhere in the
# text. Raw string so that \. and \d are regex escapes, not string escapes.
doi_jsb_re = re.compile(r'10\.1083/jcb\.\d{9}')
|
||||
clean_doi_re = re.compile('^/')
|
||||
# Detects a cleaned DOI that starts with the 10.1096 prefix. The dot is
# escaped so that only a literal "." matches, not any character.
clean_doi_fabse_re = re.compile(r'^10\.1096')
|
||||
# Detects a cleaned DOI that starts with the 10.1083 prefix. The dot is
# escaped so that only a literal "." matches, not any character.
clean_doi_jcb_re = re.compile(r'^10\.1083')
|
||||
clean_doi_len_re = re.compile(r'\d\.\d')
|
||||
|
||||
|
||||
def findDOI(src):
|
||||
"""Search for a valid DOI in src.
|
||||
|
||||
@ -99,29 +110,28 @@ def findDOI(src):
|
||||
tools.warning(extractfull[1])
|
||||
return False
|
||||
extractfull = extractfull[0]
|
||||
extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
|
||||
extractfull.lower().replace('Œ', '-'))
|
||||
extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-'))
|
||||
if not extractDOI:
|
||||
# PNAS fix
|
||||
extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+',
|
||||
extractfull.lower().replace('pnas', '/pnas'))
|
||||
extractDOI = doi_pnas_re.search(extractfull.lower().replace('pnas',
|
||||
'/pnas'))
|
||||
if not extractDOI:
|
||||
# JCB fix
|
||||
extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower())
|
||||
extractDOI = doi_jsb_re.search(extractfull.lower())
|
||||
|
||||
cleanDOI = False
|
||||
if extractDOI:
|
||||
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
|
||||
if re.search('^/', cleanDOI):
|
||||
if clean_doi_re.search(cleanDOI):
|
||||
cleanDOI = cleanDOI[1:]
|
||||
# FASEB J fix
|
||||
if re.search('^10.1096', cleanDOI):
|
||||
if clean_doi_fabse_re.search(cleanDOI):
|
||||
cleanDOI = cleanDOI[:20]
|
||||
# Second JCB fix
|
||||
if re.search('^10.1083', cleanDOI):
|
||||
if clean_doi_jcb_re.search(cleanDOI):
|
||||
cleanDOI = cleanDOI[:21]
|
||||
if len(cleanDOI) > 40:
|
||||
cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
|
||||
cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
|
||||
reps = {'.': 'A', '-': '0'}
|
||||
cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
|
||||
digitStart = 0
|
||||
|
Loading…
Reference in New Issue
Block a user