From cce4fd50bb08924a056ff71fe839232c3d8071bf Mon Sep 17 00:00:00 2001
From: Phyks
Date: Mon, 28 Apr 2014 22:23:05 +0200
Subject: [PATCH] Beginning of refactor

---
 README.md    |   2 +
 fetcher.py   | 146 +++++++++++++++++++++++++++++----
 main.py      | 227 ++++++---------------------------------------------
 tearpages.py |   7 +-
 4 files changed, 157 insertions(+), 225 deletions(-)
 mode change 100755 => 100644 fetcher.py

diff --git a/README.md b/README.md
index 0f70a49..d0fdddb 100644
--- a/README.md
+++ b/README.md
@@ -115,6 +115,8 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really
 * Split main.py
 * Categories
 * Edit an entry instead of deleting it and adding it again
+* Doc / Man
+* No DOI for arXiv / HAL

 ## Issues ?

diff --git a/fetcher.py b/fetcher.py
old mode 100755
new mode 100644
index 79dd0bf..b0a85df
--- a/fetcher.py
+++ b/fetcher.py
@@ -1,30 +1,26 @@
-#!/usr/bin/python2 -u
+#!/usr/bin/env python2
 # coding=utf8
-"""
-Fetches papers.
-"""
-
-from __future__ import print_function
-import sys
-import requesocks as requests
+import isbntools
+import re
+import requesocks as requests  # Requesocks is requests with SOCKS support
+import subprocess
+import tools
 import params


-def warning(*objs):
-    """
-    Write to stderr
-    """
-    print("WARNING: ", *objs, file=sys.stderr)
+def download(url):
+    """Download url to file.

-
-def download_url(url):
+    Checks that it is a valid pdf or djvu file. Tries all the
+    available proxies sequentially. Returns the raw content of the file, or
+    False if it could not be downloaded.
+    """
     for proxy in params.proxies:
         r_proxy = {
             "http": proxy,
             "https": proxy,
         }
-
         try:
             r = requests.get(url, proxies=r_proxy)
             contenttype = False
@@ -37,8 +33,122 @@ def download_url(url):
                 continue
             return r.content, contenttype
+            # TODO : this except clause is too broad
         except:
-            warning("Proxy "+proxy+" not available.")
+            tools.warning("Proxy "+proxy+" not available.")
             continue
-
     return False
+
+
+def findISBN(src):
+    """Search for a valid ISBN in src.
+
+    Returns the ISBN or False if not found or an error occurred."""
+    if src.endswith(".pdf"):
+        totext = subprocess.Popen(["pdftotext", src, "-"],
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
+    elif src.endswith(".djvu"):
+        totext = subprocess.Popen(["djvutxt", src],
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
+    extractfull = totext.communicate()
+    # TODO : ^ Return result before processing the whole book ?
+    if extractfull[1] != "":
+        # Error happened
+        tools.warning(extractfull[1])
+        return False
+    extractfull = extractfull[0]
+    extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
+                            extractfull.lower().replace('Œ', '-'))
+    cleanISBN = False
+    # Clean ISBN is the ISBN number without separators
+    if extractISBN:
+        cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
+    return cleanISBN
+
+
+def isbn2Bib(isbn):
+    """Try to get bibtex entry from an ISBN number"""
+    try:
+        # Default merges results from worldcat.org and google books
+        return isbntools.dev.fmt.fmtbib('bibtex',
+                                        isbntools.meta(isbn, 'default'))
+    except:
+        return ''
+
+
+def findDOI(src):
+    """Search for a valid DOI in src.
+
+    Returns the DOI or False if not found or an error occurred.
+    From: http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
+    """
+    if src.endswith(".pdf"):
+        totext = subprocess.Popen(["pdftotext", src, "-"],
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
+    elif src.endswith(".djvu"):
+        totext = subprocess.Popen(["djvutxt", src],
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
+    extractfull = totext.communicate()
+    # TODO : ^ Return result before full conversion ?
+    if extractfull[1] != "":
+        # Error happened
+        tools.warning(extractfull[1])
+        return False
+    extractfull = extractfull[0]
+    extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
+                           extractfull.lower().replace('Œ', '-'))
+    if not extractDOI:
+        # PNAS fix
+        extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+',
+                               extractfull.lower().replace('pnas', '/pnas'))
+        if not extractDOI:
+            # JCB fix
+            extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower())
+
+    cleanDOI = False
+    if extractDOI:
+        cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
+        if re.search('^/', cleanDOI):
+            cleanDOI = cleanDOI[1:]
+        # FASEB J fix
+        if re.search('^10.1096', cleanDOI):
+            cleanDOI = cleanDOI[:20]
+        # Second JCB fix
+        if re.search('^10.1083', cleanDOI):
+            cleanDOI = cleanDOI[:21]
+        if len(cleanDOI) > 40:
+            cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
+            reps = {'.': 'A', '-': '0'}
+            cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
+            digitStart = 0
+            for i in range(len(cleanDOItemp)):
+                if cleanDOItemp[i].isdigit():
+                    digitStart = 1
+                if cleanDOItemp[i].isalpha() and digitStart:
+                    break
+            cleanDOI = cleanDOI[0:(8+i)]
+    return cleanDOI
+
+
+def doi2Bib(doi):
+    """Return a bibTeX string of metadata for a given DOI.
+
+    From: https://gist.github.com/jrsmith3/5513926
+    """
+    url = "http://dx.doi.org/" + doi
+    headers = {"accept": "application/x-bibtex"}
+    try:
+        r = requests.get(url, headers=headers)
+
+        if r.headers['content-type'] == 'application/x-bibtex':
+            return r.text
+        else:
+            return ''
+    except requests.exceptions.ConnectionError:
+        tools.warning('Unable to contact remote server to get the bibtex ' +
+                      'entry for doi '+doi)
+        return ''
diff --git a/main.py b/main.py
index c240631..4f37949 100755
--- a/main.py
+++ b/main.py
@@ -1,44 +1,22 @@
 #!/usr/bin/env python2
 # -*- coding: utf8 -*-
-from __future__ import print_function
-
+import tools
 import fetcher
+import backend
 import tearpages
 import sys
 import shutil
 import tempfile
-import requests
 import subprocess
-import re
 import os
-from isbntools import meta
-from isbntools.dev.fmt import fmtbib
-try:
-    from cStringIO import StringIO
-except:
-    from StringIO import StringIO
 from bibtexparser.bparser import BibTexParser
 from bibtexparser.customization import homogeneize_latex_encoding
-from bibtexparser.bwriter import bibtex as bibTexWriter
-from termios import tcflush, TCIOFLUSH
 import params

 EDITOR = os.environ.get('EDITOR') if os.environ.get('EDITOR') else 'vim'


-def rawInput(string):
-    tcflush(sys.stdin, TCIOFLUSH)
-    return raw_input(string)
-
-
-def warning(*objs):
-    """
-    Write to stderr
-    """
-    print("WARNING: ", *objs, file=sys.stderr)
-
-
 def parsed2Bibtex(parsed):
     """
     Convert a single bibtex entry dict to bibtex string
@@ -51,165 +29,6 @@ def parsed2Bibtex(parsed):
     return bibtex


-def bibtexAppend(data):
-    """
-    Append data to the main bibtex file
-    data is a dict for one entry in bibtex, as the one from bibtexparser output
-    """
-    with open(params.folder+'index.bib', 'a') as fh:
-        fh.write(parsed2Bibtex(data)+"\n")
-
-
-def bibtexRewrite(data):
""" - Rewrite the bibtex index file. - data is a dict of bibtex entry dict. - """ - bibtex = '' - for entry in data.keys(): - bibtex += parsed2Bibtex(data[entry])+"\n" - with open(params.folder+'index.bib', 'w') as fh: - fh.write(bibtex) - - -def replaceAll(text, dic): - for i, j in dic.iteritems(): - text = text.replace(i, j) - return text - - -def findISBN(src): - if src.endswith(".pdf"): - totext = subprocess.Popen(["pdftotext", src, "-"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - elif src.endswith(".djvu"): - totext = subprocess.Popen(["djvutxt", src], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - extractfull = totext.communicate() - if extractfull[1] is not "": - return False - - extractfull = extractfull[0] - extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])", - extractfull.lower().replace('Œ', '-')) - - cleanISBN = False - if extractISBN: - cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '') - - return cleanISBN - - -def isbn2Bib(isbn): - try: - return fmtbib('bibtex', meta(isbn, 'default')) - except: - return '' - - -def findDOI(src): - if src.endswith(".pdf"): - totext = subprocess.Popen(["pdftotext", src, "-"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - elif src.endswith(".djvu"): - totext = subprocess.Popen(["djvutxt", src], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - extractfull = totext.communicate() - if extractfull[1] is not "": - return False - - extractfull = extractfull[0] - extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', - extractfull.lower().replace('Œ', '-')) - if not extractDOI: - # PNAS fix - extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+', - extractfull.lower().replace('pnas', '/pnas')) - if not extractDOI: - # JSB fix - extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower()) - - cleanDOI = False - if extractDOI: - cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '') - if re.search('^/', cleanDOI): - cleanDOI = cleanDOI[1:] - - # FABSE J fix - if re.search('^10.1096', cleanDOI): - cleanDOI = cleanDOI[:20] - - # Second JCB fix - if re.search('^10.1083', cleanDOI): - cleanDOI = cleanDOI[:21] - - if len(cleanDOI) > 40: - cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI) - reps = {'.': 'A', '-': '0'} - cleanDOItemp = replaceAll(cleanDOItemp[8:], reps) - digitStart = 0 - for i in range(len(cleanDOItemp)): - if cleanDOItemp[i].isdigit(): - digitStart = 1 - if cleanDOItemp[i].isalpha() and digitStart: - break - cleanDOI = cleanDOI[0:(8+i)] - - return cleanDOI - - -def doi2Bib(doi): - """ - Return a bibTeX string of metadata for a given DOI. - From : https://gist.github.com/jrsmith3/5513926 - """ - url = "http://dx.doi.org/" + doi - headers = {"accept": "application/x-bibtex"} - try: - r = requests.get(url, headers=headers) - - if r.headers['content-type'] == 'application/x-bibtex': - return r.text - else: - return '' - except requests.exceptions.ConnectionError: - warning('Unable to contact remote server to get the bibtex entry ' + - 'for doi '+doi) - return '' - - -_slugify_strip_re = re.compile(r'[^\w\s-]') -_slugify_hyphenate_re = re.compile(r'[\s]+') - - -def _slugify(value): - """ - Normalizes string, converts to lowercase, removes non-alpha characters, - and converts spaces to hyphens. - - From Django's "django/template/defaultfilters.py". 
- """ - import unicodedata - if not isinstance(value, unicode): - value = unicode(value) - value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore') - value = unicode(_slugify_strip_re.sub('', value).strip()) - return _slugify_hyphenate_re.sub('_', value) - - -def getExtension(filename): - """ - Get the extension of the filename - """ - return filename[filename.rfind('.'):] - - def checkBibtex(filename, bibtex): print("The bibtex entry found for "+filename+" is :") @@ -222,7 +41,7 @@ def checkBibtex(filename, bibtex): else: bibtex_string = '' print(bibtex_string) - check = rawInput("Is it correct ? [Y/n] ") + check = tools.rawInput("Is it correct ? [Y/n] ") while check.lower() == 'n': with tempfile.NamedTemporaryFile(suffix=".tmp") as tmpfile: @@ -240,7 +59,7 @@ def checkBibtex(filename, bibtex): bibtex_string = '' print("\nThe bibtex entry for "+filename+" is :") print(bibtex_string) - check = rawInput("Is it correct ? [Y/n] ") + check = tools.rawInput("Is it correct ? [Y/n] ") return bibtex @@ -256,23 +75,23 @@ def addFile(src, filetype): if doi is False and isbn is False: if filetype is None: - warning("Could not determine the DOI or the ISBN for "+src+"." + + tools.warning("Could not determine the DOI or the ISBN for "+src+"." + "Switching to manual entry.") doi_isbn = '' while doi_isbn not in ['doi', 'isbn']: - doi_isbn = rawInput("DOI / ISBN ? ").lower() + doi_isbn = tools.rawInput("DOI / ISBN ? ").lower() if doi_isbn == 'doi': - doi = rawInput('DOI ? ') + doi = tools.rawInput('DOI ? ') else: - isbn = rawInput('ISBN ? ') + isbn = tools.rawInput('ISBN ? ') elif filetype == 'article': - warning("Could not determine the DOI for "+src + + tools.warning("Could not determine the DOI for "+src + ", switching to manual entry.") - doi = rawInput('DOI ? ') + doi = tools.rawInput('DOI ? ') elif filetype == 'book': - warning("Could not determine the ISBN for "+src + + tools.warning("Could not determine the ISBN for "+src + ", switching to manual entry.") - isbn = rawInput('ISBN ? ') + isbn = tools.rawInput('ISBN ? ') elif doi is not False: print("DOI for "+src+" is "+doi+".") elif isbn is not False: @@ -310,13 +129,13 @@ def addFile(src, filetype): new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip() for i in authors])) - new_name = params.folder+_slugify(new_name)+getExtension(src) + new_name = params.folder+tools.slugify(new_name)+tools.getExtension(src) while os.path.exists(new_name): - warning("file "+new_name+" already exists.") - default_rename = new_name.replace(getExtension(new_name), - " (2)"+getExtension(new_name)) - rename = rawInput("New name ["+default_rename+"] ? ") + tools.warning("file "+new_name+" already exists.") + default_rename = new_name.replace(tools.getExtension(new_name), + " (2)"+tools.getExtension(new_name)) + rename = tools.rawInput("New name ["+default_rename+"] ? 
") if rename == '': new_name = default_rename else: @@ -351,7 +170,7 @@ def deleteId(ident): try: os.remove(bibtex[ident]['file']) except: - warning("Unable to delete file associated to id "+ident+" : " + + tools.warning("Unable to delete file associated to id "+ident+" : " + bibtex[ident]['file']) del(bibtex[ident]) bibtexRewrite(bibtex) @@ -373,7 +192,7 @@ def deleteFile(filename): try: os.remove(bibtex[key]['file']) except: - warning("Unable to delete file associated to id "+key+" : " + + tools.warning("Unable to delete file associated to id "+key+" : " + bibtex[key]['file']) del(bibtex[key]) if found: @@ -382,7 +201,7 @@ def deleteFile(filename): def downloadFile(url, filetype): - dl, contenttype = fetcher.download_url(url) + dl, contenttype = fetcher.download(url) if dl is not False: tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype) @@ -393,7 +212,7 @@ def downloadFile(url, filetype): tmp.close() return new_name else: - warning("Could not fetch "+url) + tools.warning("Could not fetch "+url) return False @@ -434,13 +253,13 @@ if __name__ == '__main__': if len(sys.argv) < 3: sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID") - confirm = rawInput("Are you sure you want to delete "+sys.argv[2] + + confirm = tools.rawInput("Are you sure you want to delete "+sys.argv[2] + " ? [y/N] ") if confirm.lower() == 'y': if not deleteId(sys.argv[2]): if not deleteFile(sys.argv[2]): - warning("Unable to delete "+sys.argv[2]) + tools.warning("Unable to delete "+sys.argv[2]) sys.exit(1) print(sys.argv[2]+" successfully deleted.") diff --git a/tearpages.py b/tearpages.py index 10e9e6a..37b75c1 100644 --- a/tearpages.py +++ b/tearpages.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # -*- coding: utf-8 -*- # Author: Francois Boulogne # License: GPLv3 @@ -30,6 +30,7 @@ def fixPdf(pdfFile, destination): output.close() shutil.copy(tmp.name, destination) + def tearpage(filename): """ Copy filename to a tempfile, write pages 1..N to filename. @@ -60,8 +61,8 @@ def tearpage(filename): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Remove the first page of a PDF', - epilog='') + parser = argparse.ArgumentParser(description='Remove the first page ' + + 'of a PDF', epilog='') parser.add_argument('--version', action='version', version=__version__) parser.add_argument('pdf', metavar='PDF', help='PDF filepath') args = parser.parse_args()