From 3d07af0e7195312b1be49e397f1692210ec16f2c Mon Sep 17 00:00:00 2001 From: Phyks Date: Wed, 30 Apr 2014 00:36:15 +0200 Subject: [PATCH] Check output before processing the whole file for ISBN and DOI search --- README.md | 2 -- fetcher.py | 56 +++++++++++++++++++++++++++++++--------------------- tearpages.py | 2 +- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 4f2053d..aa90dc9 100644 --- a/README.md +++ b/README.md @@ -108,8 +108,6 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really 10. Refactor 11. Use bibtex-parser lib to write bibtex, instead of parsed2BibTex 12. Rebuild function - 15. Check output of subprocesses before it ends - 16. TODO in files 20. No DOI for arXiv / HAL 30. Parameter to disable remote search 40. Open file diff --git a/fetcher.py b/fetcher.py index 297a6c7..3e77bef 100644 --- a/fetcher.py +++ b/fetcher.py @@ -50,19 +50,26 @@ def findISBN(src): if src.endswith(".pdf"): totext = subprocess.Popen(["pdftotext", src, "-"], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + stderr=subprocess.PIPE, + bufsize=1) elif src.endswith(".djvu"): totext = subprocess.Popen(["djvutxt", src], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - extractfull = totext.communicate() - # TODO : ^ Return result before processing the whole book ? - if extractfull[1] is not "": + stderr=subprocess.PIPE, + bufsize=1) + while totext.poll() == None: + extractfull = totext.stdin.readline() + extractISBN = isbn_re.search(extractfull.lower().replace('Œ', '-')) + if extractISBN: + totext.terminate() + break + + err = totext.communicate()[1] + if totext.returncode > 0: # Error happened - tools.warning(extractfull[1]) + tools.warning(err) return False - extractfull = extractfull[0] - extractISBN = isbn_re.search(extractfull.lower().replace('Œ', '-')) + cleanISBN = False # Clean ISBN is the ISBN number without separators if extractISBN: @@ -103,21 +110,26 @@ def findDOI(src): totext = subprocess.Popen(["djvutxt", src], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - extractfull = totext.communicate() - # TODO : ^ Return result before full conversion ? - if extractfull[1] is not "": - # Error happened - tools.warning(extractfull[1]) - return False - extractfull = extractfull[0] - extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-')) - if not extractDOI: - # PNAS fix - extractDOI = doi_pnas_re.search(extractfull.lower().replace('pnas', - '/pnas')) + + while totext.poll() == None: + extractfull = totext.stdin.readline() + extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-')) if not extractDOI: - # JSB fix - extractDOI = doi_jsb_re.search(extractfull.lower()) + # PNAS fix + extractDOI = doi_pnas_re.search(extractfull.lower().replace('pnas', + '/pnas')) + if not extractDOI: + # JSB fix + extractDOI = doi_jsb_re.search(extractfull.lower()) + if extractDOI: + totext.terminate() + break + + err = totext.communicate()[1] + if totext.returncode > 0: + # Error happened + tools.warning(err) + return False cleanDOI = False if extractDOI: diff --git a/tearpages.py b/tearpages.py index 37b75c1..6f442d7 100644 --- a/tearpages.py +++ b/tearpages.py @@ -52,7 +52,7 @@ def tearpage(filename): # Write pages excepted the first one output_file = PdfFileWriter() - for i in range(0, num_pages): + for i in range(1, num_pages): output_file.addPage(input_file.getPage(i)) tmp.close()