Various re.compile
This commit is contained in:
parent
7d36f3206f
commit
91685bc46b
@ -108,8 +108,6 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really
|
|||||||
10. Refactor
|
10. Refactor
|
||||||
11. Use bibtex-parser lib to write bibtex, instead of parsed2BibTex
|
11. Use bibtex-parser lib to write bibtex, instead of parsed2BibTex
|
||||||
12. Rebuild function
|
12. Rebuild function
|
||||||
13. Split main.py
|
|
||||||
14. Various re.compile ?
|
|
||||||
15. Check output of subprocesses before it ends
|
15. Check output of subprocesses before it ends
|
||||||
16. TODO in files
|
16. TODO in files
|
||||||
20. No DOI for arXiv / HAL
|
20. No DOI for arXiv / HAL
|
||||||
|
32
fetcher.py
32
fetcher.py
@ -40,6 +40,9 @@ def download(url):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# Precompiled pattern for an ISBN announced by the literal text "isbn ".
# Group 1 is the number itself: an optional 3-digit prefix, then groups of
# 1, 2, 6 and 1 digits. Every separator may be a space or a hyphen, which is
# what the inline pattern in findISBN accepted before it was hoisted here.
isbn_re = re.compile(
    r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])")
|
||||||
|
|
||||||
|
|
||||||
def findISBN(src):
|
def findISBN(src):
|
||||||
"""Search for a valid ISBN in src.
|
"""Search for a valid ISBN in src.
|
||||||
|
|
||||||
@ -59,8 +62,7 @@ def findISBN(src):
|
|||||||
tools.warning(extractfull[1])
|
tools.warning(extractfull[1])
|
||||||
return False
|
return False
|
||||||
extractfull = extractfull[0]
|
extractfull = extractfull[0]
|
||||||
extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
|
extractISBN = isbn_re.search(extractfull.lower().replace('Œ', '-'))
|
||||||
extractfull.lower().replace('Œ', '-'))
|
|
||||||
cleanISBN = False
|
cleanISBN = False
|
||||||
# Clean ISBN is the ISBN number without separators
|
# Clean ISBN is the ISBN number without separators
|
||||||
if extractISBN:
|
if extractISBN:
|
||||||
@ -78,6 +80,15 @@ def isbn2Bib(isbn):
|
|||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
# Generic DOI pattern: must directly follow the text "doi", then an optional
# "/", ":" and whitespace character, a 7-character run of digits/dots, a "/",
# and a non-whitespace suffix that ends on a digit.
# Raw string so that \s, \. and \S reach the regex engine untouched instead of
# being treated as (invalid) string escapes.
doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]')
|
||||||
|
# PNAS DOI pattern: the text "doi", at most one arbitrary character, then the
# literal prefix "10.1073/pnas." followed by digits.
# Raw string, and the dot inside "10.1073" is escaped so that it only matches
# a real dot rather than any character.
doi_pnas_re = re.compile(r'(?<=doi).?10\.1073/pnas\.\d+')
|
||||||
|
# JCB DOI pattern: the literal "10.1083/jcb." followed by exactly nine digits.
# Raw string so that \. and \d are regex escapes, not string escapes.
doi_jsb_re = re.compile(r'10\.1083/jcb\.\d{9}')
|
||||||
|
# Matches a "/" at the very start of the string.
clean_doi_re = re.compile(r"^/")
|
||||||
|
# Matches strings that start with the DOI prefix "10.1096".
# The dot is escaped so that only a literal "." is accepted there.
clean_doi_fabse_re = re.compile(r'^10\.1096')
|
||||||
|
# Matches strings that start with the DOI prefix "10.1083".
# The dot is escaped so that only a literal "." is accepted there.
clean_doi_jcb_re = re.compile(r'^10\.1083')
|
||||||
|
# Matches a digit, a literal dot, and another digit.
clean_doi_len_re = re.compile(r"\d\.\d")
|
||||||
|
|
||||||
|
|
||||||
def findDOI(src):
|
def findDOI(src):
|
||||||
"""Search for a valid DOI in src.
|
"""Search for a valid DOI in src.
|
||||||
|
|
||||||
@ -99,29 +110,28 @@ def findDOI(src):
|
|||||||
tools.warning(extractfull[1])
|
tools.warning(extractfull[1])
|
||||||
return False
|
return False
|
||||||
extractfull = extractfull[0]
|
extractfull = extractfull[0]
|
||||||
extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
|
extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-'))
|
||||||
extractfull.lower().replace('Œ', '-'))
|
|
||||||
if not extractDOI:
|
if not extractDOI:
|
||||||
# PNAS fix
|
# PNAS fix
|
||||||
extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+',
|
extractDOI = doi_pnas_re.search(extractfull.lower().replace('pnas',
|
||||||
extractfull.lower().replace('pnas', '/pnas'))
|
'/pnas'))
|
||||||
if not extractDOI:
|
if not extractDOI:
|
||||||
# JSB fix
|
# JCB fix
|
||||||
extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower())
|
extractDOI = doi_jsb_re.search(extractfull.lower())
|
||||||
|
|
||||||
cleanDOI = False
|
cleanDOI = False
|
||||||
if extractDOI:
|
if extractDOI:
|
||||||
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
|
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
|
||||||
if re.search('^/', cleanDOI):
|
if clean_doi_re.search(cleanDOI):
|
||||||
cleanDOI = cleanDOI[1:]
|
cleanDOI = cleanDOI[1:]
|
||||||
# FABSE J fix
|
# FASEB J fix
|
||||||
if re.search('^10.1096', cleanDOI):
|
if clean_doi_fabse_re.search(cleanDOI):
|
||||||
cleanDOI = cleanDOI[:20]
|
cleanDOI = cleanDOI[:20]
|
||||||
# Second JCB fix
|
# Second JCB fix
|
||||||
if re.search('^10.1083', cleanDOI):
|
if clean_doi_jcb_re.search(cleanDOI):
|
||||||
cleanDOI = cleanDOI[:21]
|
cleanDOI = cleanDOI[:21]
|
||||||
if len(cleanDOI) > 40:
|
if len(cleanDOI) > 40:
|
||||||
cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
|
cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
|
||||||
reps = {'.': 'A', '-': '0'}
|
reps = {'.': 'A', '-': '0'}
|
||||||
cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
|
cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
|
||||||
digitStart = 0
|
digitStart = 0
|
||||||
|
Loading…
Reference in New Issue
Block a user