Beginning of refactor

This commit is contained in:
Phyks 2014-04-28 22:23:05 +02:00
parent 613f082471
commit cce4fd50bb
4 changed files with 157 additions and 225 deletions

View File

@ -115,6 +115,8 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really
* Split main.py * Split main.py
* Categories * Categories
* Edit an entry instead of deleting it and adding it again * Edit an entry instead of deleting it and adding it again
* Doc / Man
* No DOI for arXiv / HAL
## Issues ? ## Issues ?

146
fetcher.py Executable file → Normal file
View File

@ -1,30 +1,26 @@
#!/usr/bin/python2 -u #!/usr/bin/env python2
# coding=utf8 # coding=utf8
""" import isbntools
Fetches papers. import re
""" import requesocks as requests # Requesocks is requests with SOCKS support
import subprocess
from __future__ import print_function import tools
import sys
import requesocks as requests
import params import params
def warning(*objs): def download(url):
""" """Download url tofile
Write to stderr
"""
print("WARNING: ", *objs, file=sys.stderr)
Check that it is a valid pdf or djvu file. Tries all the
def download_url(url): available proxies sequentially. Returns the raw content of the file, or
false if it could not be downloaded.
"""
for proxy in params.proxies: for proxy in params.proxies:
r_proxy = { r_proxy = {
"http": proxy, "http": proxy,
"https": proxy, "https": proxy,
} }
try: try:
r = requests.get(url, proxies=r_proxy) r = requests.get(url, proxies=r_proxy)
contenttype = False contenttype = False
@ -37,8 +33,122 @@ def download_url(url):
continue continue
return r.content, contenttype return r.content, contenttype
# TODO : except trop large
except: except:
warning("Proxy "+proxy+" not available.") tools.warning("Proxy "+proxy+" not available.")
continue continue
return False return False
def findISBN(src):
"""Search for a valid ISBN in src.
Returns the ISBN or false if not found or an error occurred."""
if src.endswith(".pdf"):
totext = subprocess.Popen(["pdftotext", src, "-"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
elif src.endswith(".djvu"):
totext = subprocess.Popen(["djvutxt", src],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
extractfull = totext.communicate()
# TODO : ^ Return result before processing the whole book ?
if extractfull[1] is not "":
# Error happened
tools.warning(extractfull[1])
return False
extractfull = extractfull[0]
extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
extractfull.lower().replace('Œ', '-'))
cleanISBN = False
# Clean ISBN is the ISBN number without separators
if extractISBN:
cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
return cleanISBN
def isbn2Bib(isbn):
"""Try to get bibtex entry from an ISBN number"""
try:
# Default merges results from worldcat.org and google books
return isbntools.dev.fmt.fmtbib('bibtex',
isbntools.meta(isbn, 'default'))
except:
return ''
def findDOI(src):
"""Search for a valid DOI in src.
Returns the DOI or False if not found or an error occurred.
From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
"""
if src.endswith(".pdf"):
totext = subprocess.Popen(["pdftotext", src, "-"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
elif src.endswith(".djvu"):
totext = subprocess.Popen(["djvutxt", src],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
extractfull = totext.communicate()
# TODO : ^ Return result before full conversion ?
if extractfull[1] is not "":
# Error happened
tools.warning(extractfull[1])
return False
extractfull = extractfull[0]
extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
extractfull.lower().replace('&#338;', '-'))
if not extractDOI:
# PNAS fix
extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+',
extractfull.lower().replace('pnas', '/pnas'))
if not extractDOI:
# JSB fix
extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower())
cleanDOI = False
if extractDOI:
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
if re.search('^/', cleanDOI):
cleanDOI = cleanDOI[1:]
# FABSE J fix
if re.search('^10.1096', cleanDOI):
cleanDOI = cleanDOI[:20]
# Second JCB fix
if re.search('^10.1083', cleanDOI):
cleanDOI = cleanDOI[:21]
if len(cleanDOI) > 40:
cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
reps = {'.': 'A', '-': '0'}
cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
digitStart = 0
for i in range(len(cleanDOItemp)):
if cleanDOItemp[i].isdigit():
digitStart = 1
if cleanDOItemp[i].isalpha() and digitStart:
break
cleanDOI = cleanDOI[0:(8+i)]
return cleanDOI
def doi2Bib(doi):
"""Return a bibTeX string of metadata for a given DOI.
From : https://gist.github.com/jrsmith3/5513926
"""
url = "http://dx.doi.org/" + doi
headers = {"accept": "application/x-bibtex"}
try:
r = requests.get(url, headers=headers)
if r.headers['content-type'] == 'application/x-bibtex':
return r.text
else:
return ''
except requests.exceptions.ConnectionError:
tools.warning('Unable to contact remote server to get the bibtex ' +
'entry for doi '+doi)
return ''

227
main.py
View File

@ -1,44 +1,22 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
# -*- coding: utf8 -*- # -*- coding: utf8 -*-
from __future__ import print_function import tools
import fetcher import fetcher
import backend
import tearpages import tearpages
import sys import sys
import shutil import shutil
import tempfile import tempfile
import requests
import subprocess import subprocess
import re
import os import os
from isbntools import meta
from isbntools.dev.fmt import fmtbib
try:
from cStringIO import StringIO
except:
from StringIO import StringIO
from bibtexparser.bparser import BibTexParser from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import homogeneize_latex_encoding from bibtexparser.customization import homogeneize_latex_encoding
from bibtexparser.bwriter import bibtex as bibTexWriter
from termios import tcflush, TCIOFLUSH
import params import params
EDITOR = os.environ.get('EDITOR') if os.environ.get('EDITOR') else 'vim' EDITOR = os.environ.get('EDITOR') if os.environ.get('EDITOR') else 'vim'
def rawInput(string):
tcflush(sys.stdin, TCIOFLUSH)
return raw_input(string)
def warning(*objs):
"""
Write to stderr
"""
print("WARNING: ", *objs, file=sys.stderr)
def parsed2Bibtex(parsed): def parsed2Bibtex(parsed):
""" """
Convert a single bibtex entry dict to bibtex string Convert a single bibtex entry dict to bibtex string
@ -51,165 +29,6 @@ def parsed2Bibtex(parsed):
return bibtex return bibtex
def bibtexAppend(data):
"""
Append data to the main bibtex file
data is a dict for one entry in bibtex, as the one from bibtexparser output
"""
with open(params.folder+'index.bib', 'a') as fh:
fh.write(parsed2Bibtex(data)+"\n")
def bibtexRewrite(data):
"""
Rewrite the bibtex index file.
data is a dict of bibtex entry dict.
"""
bibtex = ''
for entry in data.keys():
bibtex += parsed2Bibtex(data[entry])+"\n"
with open(params.folder+'index.bib', 'w') as fh:
fh.write(bibtex)
def replaceAll(text, dic):
for i, j in dic.iteritems():
text = text.replace(i, j)
return text
def findISBN(src):
if src.endswith(".pdf"):
totext = subprocess.Popen(["pdftotext", src, "-"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
elif src.endswith(".djvu"):
totext = subprocess.Popen(["djvutxt", src],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
extractfull = totext.communicate()
if extractfull[1] is not "":
return False
extractfull = extractfull[0]
extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
extractfull.lower().replace('&#338;', '-'))
cleanISBN = False
if extractISBN:
cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
return cleanISBN
def isbn2Bib(isbn):
try:
return fmtbib('bibtex', meta(isbn, 'default'))
except:
return ''
def findDOI(src):
if src.endswith(".pdf"):
totext = subprocess.Popen(["pdftotext", src, "-"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
elif src.endswith(".djvu"):
totext = subprocess.Popen(["djvutxt", src],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
extractfull = totext.communicate()
if extractfull[1] is not "":
return False
extractfull = extractfull[0]
extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
extractfull.lower().replace('&#338;', '-'))
if not extractDOI:
# PNAS fix
extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+',
extractfull.lower().replace('pnas', '/pnas'))
if not extractDOI:
# JSB fix
extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower())
cleanDOI = False
if extractDOI:
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
if re.search('^/', cleanDOI):
cleanDOI = cleanDOI[1:]
# FABSE J fix
if re.search('^10.1096', cleanDOI):
cleanDOI = cleanDOI[:20]
# Second JCB fix
if re.search('^10.1083', cleanDOI):
cleanDOI = cleanDOI[:21]
if len(cleanDOI) > 40:
cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
reps = {'.': 'A', '-': '0'}
cleanDOItemp = replaceAll(cleanDOItemp[8:], reps)
digitStart = 0
for i in range(len(cleanDOItemp)):
if cleanDOItemp[i].isdigit():
digitStart = 1
if cleanDOItemp[i].isalpha() and digitStart:
break
cleanDOI = cleanDOI[0:(8+i)]
return cleanDOI
def doi2Bib(doi):
"""
Return a bibTeX string of metadata for a given DOI.
From : https://gist.github.com/jrsmith3/5513926
"""
url = "http://dx.doi.org/" + doi
headers = {"accept": "application/x-bibtex"}
try:
r = requests.get(url, headers=headers)
if r.headers['content-type'] == 'application/x-bibtex':
return r.text
else:
return ''
except requests.exceptions.ConnectionError:
warning('Unable to contact remote server to get the bibtex entry ' +
'for doi '+doi)
return ''
_slugify_strip_re = re.compile(r'[^\w\s-]')
_slugify_hyphenate_re = re.compile(r'[\s]+')
def _slugify(value):
"""
Normalizes string, converts to lowercase, removes non-alpha characters,
and converts spaces to hyphens.
From Django's "django/template/defaultfilters.py".
"""
import unicodedata
if not isinstance(value, unicode):
value = unicode(value)
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
value = unicode(_slugify_strip_re.sub('', value).strip())
return _slugify_hyphenate_re.sub('_', value)
def getExtension(filename):
"""
Get the extension of the filename
"""
return filename[filename.rfind('.'):]
def checkBibtex(filename, bibtex): def checkBibtex(filename, bibtex):
print("The bibtex entry found for "+filename+" is :") print("The bibtex entry found for "+filename+" is :")
@ -222,7 +41,7 @@ def checkBibtex(filename, bibtex):
else: else:
bibtex_string = '' bibtex_string = ''
print(bibtex_string) print(bibtex_string)
check = rawInput("Is it correct ? [Y/n] ") check = tools.rawInput("Is it correct ? [Y/n] ")
while check.lower() == 'n': while check.lower() == 'n':
with tempfile.NamedTemporaryFile(suffix=".tmp") as tmpfile: with tempfile.NamedTemporaryFile(suffix=".tmp") as tmpfile:
@ -240,7 +59,7 @@ def checkBibtex(filename, bibtex):
bibtex_string = '' bibtex_string = ''
print("\nThe bibtex entry for "+filename+" is :") print("\nThe bibtex entry for "+filename+" is :")
print(bibtex_string) print(bibtex_string)
check = rawInput("Is it correct ? [Y/n] ") check = tools.rawInput("Is it correct ? [Y/n] ")
return bibtex return bibtex
@ -256,23 +75,23 @@ def addFile(src, filetype):
if doi is False and isbn is False: if doi is False and isbn is False:
if filetype is None: if filetype is None:
warning("Could not determine the DOI or the ISBN for "+src+"." + tools.warning("Could not determine the DOI or the ISBN for "+src+"." +
"Switching to manual entry.") "Switching to manual entry.")
doi_isbn = '' doi_isbn = ''
while doi_isbn not in ['doi', 'isbn']: while doi_isbn not in ['doi', 'isbn']:
doi_isbn = rawInput("DOI / ISBN ? ").lower() doi_isbn = tools.rawInput("DOI / ISBN ? ").lower()
if doi_isbn == 'doi': if doi_isbn == 'doi':
doi = rawInput('DOI ? ') doi = tools.rawInput('DOI ? ')
else: else:
isbn = rawInput('ISBN ? ') isbn = tools.rawInput('ISBN ? ')
elif filetype == 'article': elif filetype == 'article':
warning("Could not determine the DOI for "+src + tools.warning("Could not determine the DOI for "+src +
", switching to manual entry.") ", switching to manual entry.")
doi = rawInput('DOI ? ') doi = tools.rawInput('DOI ? ')
elif filetype == 'book': elif filetype == 'book':
warning("Could not determine the ISBN for "+src + tools.warning("Could not determine the ISBN for "+src +
", switching to manual entry.") ", switching to manual entry.")
isbn = rawInput('ISBN ? ') isbn = tools.rawInput('ISBN ? ')
elif doi is not False: elif doi is not False:
print("DOI for "+src+" is "+doi+".") print("DOI for "+src+" is "+doi+".")
elif isbn is not False: elif isbn is not False:
@ -310,13 +129,13 @@ def addFile(src, filetype):
new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip() new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip()
for i in authors])) for i in authors]))
new_name = params.folder+_slugify(new_name)+getExtension(src) new_name = params.folder+tools.slugify(new_name)+tools.getExtension(src)
while os.path.exists(new_name): while os.path.exists(new_name):
warning("file "+new_name+" already exists.") tools.warning("file "+new_name+" already exists.")
default_rename = new_name.replace(getExtension(new_name), default_rename = new_name.replace(tools.getExtension(new_name),
" (2)"+getExtension(new_name)) " (2)"+tools.getExtension(new_name))
rename = rawInput("New name ["+default_rename+"] ? ") rename = tools.rawInput("New name ["+default_rename+"] ? ")
if rename == '': if rename == '':
new_name = default_rename new_name = default_rename
else: else:
@ -351,7 +170,7 @@ def deleteId(ident):
try: try:
os.remove(bibtex[ident]['file']) os.remove(bibtex[ident]['file'])
except: except:
warning("Unable to delete file associated to id "+ident+" : " + tools.warning("Unable to delete file associated to id "+ident+" : " +
bibtex[ident]['file']) bibtex[ident]['file'])
del(bibtex[ident]) del(bibtex[ident])
bibtexRewrite(bibtex) bibtexRewrite(bibtex)
@ -373,7 +192,7 @@ def deleteFile(filename):
try: try:
os.remove(bibtex[key]['file']) os.remove(bibtex[key]['file'])
except: except:
warning("Unable to delete file associated to id "+key+" : " + tools.warning("Unable to delete file associated to id "+key+" : " +
bibtex[key]['file']) bibtex[key]['file'])
del(bibtex[key]) del(bibtex[key])
if found: if found:
@ -382,7 +201,7 @@ def deleteFile(filename):
def downloadFile(url, filetype): def downloadFile(url, filetype):
dl, contenttype = fetcher.download_url(url) dl, contenttype = fetcher.download(url)
if dl is not False: if dl is not False:
tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype) tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype)
@ -393,7 +212,7 @@ def downloadFile(url, filetype):
tmp.close() tmp.close()
return new_name return new_name
else: else:
warning("Could not fetch "+url) tools.warning("Could not fetch "+url)
return False return False
@ -434,13 +253,13 @@ if __name__ == '__main__':
if len(sys.argv) < 3: if len(sys.argv) < 3:
sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID") sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID")
confirm = rawInput("Are you sure you want to delete "+sys.argv[2] + confirm = tools.rawInput("Are you sure you want to delete "+sys.argv[2] +
" ? [y/N] ") " ? [y/N] ")
if confirm.lower() == 'y': if confirm.lower() == 'y':
if not deleteId(sys.argv[2]): if not deleteId(sys.argv[2]):
if not deleteFile(sys.argv[2]): if not deleteFile(sys.argv[2]):
warning("Unable to delete "+sys.argv[2]) tools.warning("Unable to delete "+sys.argv[2])
sys.exit(1) sys.exit(1)
print(sys.argv[2]+" successfully deleted.") print(sys.argv[2]+" successfully deleted.")

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python2
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Author: Francois Boulogne # Author: Francois Boulogne
# License: GPLv3 # License: GPLv3
@ -30,6 +30,7 @@ def fixPdf(pdfFile, destination):
output.close() output.close()
shutil.copy(tmp.name, destination) shutil.copy(tmp.name, destination)
def tearpage(filename): def tearpage(filename):
""" """
Copy filename to a tempfile, write pages 1..N to filename. Copy filename to a tempfile, write pages 1..N to filename.
@ -60,8 +61,8 @@ def tearpage(filename):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Remove the first page of a PDF', parser = argparse.ArgumentParser(description='Remove the first page ' +
epilog='') 'of a PDF', epilog='')
parser.add_argument('--version', action='version', version=__version__) parser.add_argument('--version', action='version', version=__version__)
parser.add_argument('pdf', metavar='PDF', help='PDF filepath') parser.add_argument('pdf', metavar='PDF', help='PDF filepath')
args = parser.parse_args() args = parser.parse_args()