From 787113db665b8f963d5b22e62a5b0ecd9c597b33 Mon Sep 17 00:00:00 2001 From: Phyks Date: Fri, 2 May 2014 00:33:09 +0200 Subject: [PATCH] Bugfixes in fetcher.py + function to find hal id --- fetcher.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/fetcher.py b/fetcher.py index df67f14..077ee1b 100644 --- a/fetcher.py +++ b/fetcher.py @@ -58,8 +58,11 @@ def findISBN(src): stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) + else: + return False + while totext.poll() is None: - extractfull = totext.stdin.readline() + extractfull = totext.stdout.readline() extractISBN = isbn_re.search(extractfull.lower().replace('Œ', '-')) if extractISBN: @@ -112,9 +115,11 @@ def findDOI(src): totext = subprocess.Popen(["djvutxt", src], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + return False while totext.poll() is None: - extractfull = totext.stdin.readline() + extractfull = totext.stdout.readline() extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-')) if not extractDOI: # PNAS fix @@ -182,7 +187,7 @@ arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)') def findArXivId(src): - """Search for a valid arXiv id in src. + """Searches for a valid arXiv id in src. Returns the arXiv id or False if not found or an error occurred. 
From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb @@ -195,9 +200,11 @@ def findArXivId(src): totext = subprocess.Popen(["djvutxt", src], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + return False while totext.poll() is None: - extractfull = totext.stdin.readline() + extractfull = totext.stdout.readline() extractID = arXiv_re.search(extractfull) if extractID: totext.terminate() @@ -209,7 +216,7 @@ def findArXivId(src): tools.warning(err) return False else: - return extractID + return extractID.group(1) if extractID else False def arXiv2Bib(arxiv): @@ -224,3 +231,39 @@ def arXiv2Bib(arxiv): else: return bib.bibtex() return False + + +HAL_re = re.compile(r'(hal-\d{8}), version (\d+)') + + +def findHALId(src): + """Searches for a valid HAL id in src. + + Returns a tuple of the HAL id and the version + or False if not found or an error occurred. + """ + if src.endswith(".pdf"): + totext = subprocess.Popen(["pdftotext", src, "-"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + elif src.endswith(".djvu"): + totext = subprocess.Popen(["djvutxt", src], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + else: + return False + + while totext.poll() is None: + extractfull = totext.stdout.readline() + extractID = HAL_re.search(extractfull) + if extractID: + totext.terminate() + break + + err = totext.communicate()[1] + if totext.returncode > 0: + # Error happened + tools.warning(err) + return False + else: + return (extractID.group(1), extractID.group(2)) if extractID else False