Bugfixes in fetcher.py + function to find hal id

This commit is contained in:
Phyks 2014-05-02 00:33:09 +02:00
parent 289c7dece4
commit 787113db66

View File

@ -58,8 +58,11 @@ def findISBN(src):
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
bufsize=1) bufsize=1)
else:
return False
while totext.poll() is None: while totext.poll() is None:
extractfull = totext.stdin.readline() extractfull = totext.stdout.readline()
extractISBN = isbn_re.search(extractfull.lower().replace('Œ', extractISBN = isbn_re.search(extractfull.lower().replace('Œ',
'-')) '-'))
if extractISBN: if extractISBN:
@ -112,9 +115,11 @@ def findDOI(src):
totext = subprocess.Popen(["djvutxt", src], totext = subprocess.Popen(["djvutxt", src],
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE) stderr=subprocess.PIPE)
else:
return False
while totext.poll() is None: while totext.poll() is None:
extractfull = totext.stdin.readline() extractfull = totext.stdout.readline()
extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-')) extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-'))
if not extractDOI: if not extractDOI:
# PNAS fix # PNAS fix
@ -182,7 +187,7 @@ arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)')
def findArXivId(src): def findArXivId(src):
"""Search for a valid arXiv id in src. """Searches for a valid arXiv id in src.
Returns the arXiv id or False if not found or an error occurred. Returns the arXiv id or False if not found or an error occurred.
From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
@ -195,9 +200,11 @@ def findArXivId(src):
totext = subprocess.Popen(["djvutxt", src], totext = subprocess.Popen(["djvutxt", src],
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE) stderr=subprocess.PIPE)
else:
return False
while totext.poll() is None: while totext.poll() is None:
extractfull = totext.stdin.readline() extractfull = totext.stdout.readline()
extractID = arXiv_re.search(extractfull) extractID = arXiv_re.search(extractfull)
if extractID: if extractID:
totext.terminate() totext.terminate()
@ -209,7 +216,7 @@ def findArXivId(src):
tools.warning(err) tools.warning(err)
return False return False
else: else:
return extractID return extractID.group(1)
def arXiv2Bib(arxiv): def arXiv2Bib(arxiv):
@ -224,3 +231,39 @@ def arXiv2Bib(arxiv):
else: else:
return bib.bibtex() return bib.bibtex()
return False return False
HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')


def findHALId(src):
    """Searches for a valid HAL id in src.

    The text of src is extracted with pdftotext (for ".pdf" files) or
    djvutxt (for ".djvu" files) and scanned line by line with HAL_re.

    Returns a tuple (HAL id, version), or False if src has an unsupported
    extension, no HAL id was found, or the extraction tool exited with an
    error.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return False

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    # Start from "no match" so the name is always bound, even when the tool
    # produces no output at all.
    extractID = None
    # Read until EOF instead of polling for process exit: the tool may finish
    # while unread output is still buffered in the pipe.
    for extractfull in iter(totext.stdout.readline, b''):
        # The pipe yields bytes on Python 3; the pattern is a text pattern.
        if not isinstance(extractfull, str):
            extractfull = extractfull.decode('utf-8', 'replace')
        extractID = HAL_re.search(extractfull)
        if extractID:
            # No need to convert the rest of the document.
            totext.terminate()
            break

    err = totext.communicate()[1]
    # A terminated process has a negative return code, so only genuine tool
    # failures are caught here.
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False
    if not extractID:
        # Tool ran fine but the document contains no HAL id.
        return False
    return extractID.group(1), extractID.group(2)