Beginning of refactor
This commit is contained in:
parent
613f082471
commit
cce4fd50bb
@ -115,6 +115,8 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really
|
||||
* Split main.py
|
||||
* Categories
|
||||
* Edit an entry instead of deleting it and adding it again
|
||||
* Doc / Man
|
||||
* No DOI for arXiv / HAL
|
||||
|
||||
## Issues ?
|
||||
|
||||
|
146
fetcher.py
Executable file → Normal file
146
fetcher.py
Executable file → Normal file
@ -1,30 +1,26 @@
|
||||
#!/usr/bin/python2 -u
|
||||
#!/usr/bin/env python2
|
||||
# coding=utf8
|
||||
|
||||
"""
|
||||
Fetches papers.
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import requesocks as requests
|
||||
import isbntools
|
||||
import re
|
||||
import requesocks as requests # Requesocks is requests with SOCKS support
|
||||
import subprocess
|
||||
import tools
|
||||
import params
|
||||
|
||||
|
||||
def warning(*objs):
|
||||
"""
|
||||
Write to stderr
|
||||
"""
|
||||
print("WARNING: ", *objs, file=sys.stderr)
|
||||
def download(url):
|
||||
"""Download url tofile
|
||||
|
||||
|
||||
def download_url(url):
|
||||
Check that it is a valid pdf or djvu file. Tries all the
|
||||
available proxies sequentially. Returns the raw content of the file, or
|
||||
false if it could not be downloaded.
|
||||
"""
|
||||
for proxy in params.proxies:
|
||||
r_proxy = {
|
||||
"http": proxy,
|
||||
"https": proxy,
|
||||
}
|
||||
|
||||
try:
|
||||
r = requests.get(url, proxies=r_proxy)
|
||||
contenttype = False
|
||||
@ -37,8 +33,122 @@ def download_url(url):
|
||||
continue
|
||||
|
||||
return r.content, contenttype
|
||||
# TODO : except trop large
|
||||
except:
|
||||
warning("Proxy "+proxy+" not available.")
|
||||
tools.warning("Proxy "+proxy+" not available.")
|
||||
continue
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def findISBN(src):
|
||||
"""Search for a valid ISBN in src.
|
||||
|
||||
Returns the ISBN or false if not found or an error occurred."""
|
||||
if src.endswith(".pdf"):
|
||||
totext = subprocess.Popen(["pdftotext", src, "-"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
elif src.endswith(".djvu"):
|
||||
totext = subprocess.Popen(["djvutxt", src],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
extractfull = totext.communicate()
|
||||
# TODO : ^ Return result before processing the whole book ?
|
||||
if extractfull[1] is not "":
|
||||
# Error happened
|
||||
tools.warning(extractfull[1])
|
||||
return False
|
||||
extractfull = extractfull[0]
|
||||
extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
|
||||
extractfull.lower().replace('Œ', '-'))
|
||||
cleanISBN = False
|
||||
# Clean ISBN is the ISBN number without separators
|
||||
if extractISBN:
|
||||
cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
|
||||
return cleanISBN
|
||||
|
||||
|
||||
def isbn2Bib(isbn):
|
||||
"""Try to get bibtex entry from an ISBN number"""
|
||||
try:
|
||||
# Default merges results from worldcat.org and google books
|
||||
return isbntools.dev.fmt.fmtbib('bibtex',
|
||||
isbntools.meta(isbn, 'default'))
|
||||
except:
|
||||
return ''
|
||||
|
||||
|
||||
def findDOI(src):
|
||||
"""Search for a valid DOI in src.
|
||||
|
||||
Returns the DOI or False if not found or an error occurred.
|
||||
From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
|
||||
"""
|
||||
if src.endswith(".pdf"):
|
||||
totext = subprocess.Popen(["pdftotext", src, "-"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
elif src.endswith(".djvu"):
|
||||
totext = subprocess.Popen(["djvutxt", src],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
extractfull = totext.communicate()
|
||||
# TODO : ^ Return result before full conversion ?
|
||||
if extractfull[1] is not "":
|
||||
# Error happened
|
||||
tools.warning(extractfull[1])
|
||||
return False
|
||||
extractfull = extractfull[0]
|
||||
extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
|
||||
extractfull.lower().replace('Œ', '-'))
|
||||
if not extractDOI:
|
||||
# PNAS fix
|
||||
extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+',
|
||||
extractfull.lower().replace('pnas', '/pnas'))
|
||||
if not extractDOI:
|
||||
# JSB fix
|
||||
extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower())
|
||||
|
||||
cleanDOI = False
|
||||
if extractDOI:
|
||||
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
|
||||
if re.search('^/', cleanDOI):
|
||||
cleanDOI = cleanDOI[1:]
|
||||
# FABSE J fix
|
||||
if re.search('^10.1096', cleanDOI):
|
||||
cleanDOI = cleanDOI[:20]
|
||||
# Second JCB fix
|
||||
if re.search('^10.1083', cleanDOI):
|
||||
cleanDOI = cleanDOI[:21]
|
||||
if len(cleanDOI) > 40:
|
||||
cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
|
||||
reps = {'.': 'A', '-': '0'}
|
||||
cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
|
||||
digitStart = 0
|
||||
for i in range(len(cleanDOItemp)):
|
||||
if cleanDOItemp[i].isdigit():
|
||||
digitStart = 1
|
||||
if cleanDOItemp[i].isalpha() and digitStart:
|
||||
break
|
||||
cleanDOI = cleanDOI[0:(8+i)]
|
||||
return cleanDOI
|
||||
|
||||
|
||||
def doi2Bib(doi):
|
||||
"""Return a bibTeX string of metadata for a given DOI.
|
||||
|
||||
From : https://gist.github.com/jrsmith3/5513926
|
||||
"""
|
||||
url = "http://dx.doi.org/" + doi
|
||||
headers = {"accept": "application/x-bibtex"}
|
||||
try:
|
||||
r = requests.get(url, headers=headers)
|
||||
|
||||
if r.headers['content-type'] == 'application/x-bibtex':
|
||||
return r.text
|
||||
else:
|
||||
return ''
|
||||
except requests.exceptions.ConnectionError:
|
||||
tools.warning('Unable to contact remote server to get the bibtex ' +
|
||||
'entry for doi '+doi)
|
||||
return ''
|
||||
|
227
main.py
227
main.py
@ -1,44 +1,22 @@
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import tools
|
||||
import fetcher
|
||||
import backend
|
||||
import tearpages
|
||||
import sys
|
||||
import shutil
|
||||
import tempfile
|
||||
import requests
|
||||
import subprocess
|
||||
import re
|
||||
import os
|
||||
from isbntools import meta
|
||||
from isbntools.dev.fmt import fmtbib
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except:
|
||||
from StringIO import StringIO
|
||||
from bibtexparser.bparser import BibTexParser
|
||||
from bibtexparser.customization import homogeneize_latex_encoding
|
||||
from bibtexparser.bwriter import bibtex as bibTexWriter
|
||||
from termios import tcflush, TCIOFLUSH
|
||||
import params
|
||||
|
||||
EDITOR = os.environ.get('EDITOR') if os.environ.get('EDITOR') else 'vim'
|
||||
|
||||
|
||||
def rawInput(string):
|
||||
tcflush(sys.stdin, TCIOFLUSH)
|
||||
return raw_input(string)
|
||||
|
||||
|
||||
def warning(*objs):
|
||||
"""
|
||||
Write to stderr
|
||||
"""
|
||||
print("WARNING: ", *objs, file=sys.stderr)
|
||||
|
||||
|
||||
def parsed2Bibtex(parsed):
|
||||
"""
|
||||
Convert a single bibtex entry dict to bibtex string
|
||||
@ -51,165 +29,6 @@ def parsed2Bibtex(parsed):
|
||||
return bibtex
|
||||
|
||||
|
||||
def bibtexAppend(data):
|
||||
"""
|
||||
Append data to the main bibtex file
|
||||
data is a dict for one entry in bibtex, as the one from bibtexparser output
|
||||
"""
|
||||
with open(params.folder+'index.bib', 'a') as fh:
|
||||
fh.write(parsed2Bibtex(data)+"\n")
|
||||
|
||||
|
||||
def bibtexRewrite(data):
|
||||
"""
|
||||
Rewrite the bibtex index file.
|
||||
data is a dict of bibtex entry dict.
|
||||
"""
|
||||
bibtex = ''
|
||||
for entry in data.keys():
|
||||
bibtex += parsed2Bibtex(data[entry])+"\n"
|
||||
with open(params.folder+'index.bib', 'w') as fh:
|
||||
fh.write(bibtex)
|
||||
|
||||
|
||||
def replaceAll(text, dic):
|
||||
for i, j in dic.iteritems():
|
||||
text = text.replace(i, j)
|
||||
return text
|
||||
|
||||
|
||||
def findISBN(src):
|
||||
if src.endswith(".pdf"):
|
||||
totext = subprocess.Popen(["pdftotext", src, "-"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
elif src.endswith(".djvu"):
|
||||
totext = subprocess.Popen(["djvutxt", src],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
|
||||
extractfull = totext.communicate()
|
||||
if extractfull[1] is not "":
|
||||
return False
|
||||
|
||||
extractfull = extractfull[0]
|
||||
extractISBN = re.search(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
|
||||
extractfull.lower().replace('Œ', '-'))
|
||||
|
||||
cleanISBN = False
|
||||
if extractISBN:
|
||||
cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
|
||||
|
||||
return cleanISBN
|
||||
|
||||
|
||||
def isbn2Bib(isbn):
|
||||
try:
|
||||
return fmtbib('bibtex', meta(isbn, 'default'))
|
||||
except:
|
||||
return ''
|
||||
|
||||
|
||||
def findDOI(src):
|
||||
if src.endswith(".pdf"):
|
||||
totext = subprocess.Popen(["pdftotext", src, "-"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
elif src.endswith(".djvu"):
|
||||
totext = subprocess.Popen(["djvutxt", src],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
|
||||
extractfull = totext.communicate()
|
||||
if extractfull[1] is not "":
|
||||
return False
|
||||
|
||||
extractfull = extractfull[0]
|
||||
extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
|
||||
extractfull.lower().replace('Œ', '-'))
|
||||
if not extractDOI:
|
||||
# PNAS fix
|
||||
extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+',
|
||||
extractfull.lower().replace('pnas', '/pnas'))
|
||||
if not extractDOI:
|
||||
# JSB fix
|
||||
extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower())
|
||||
|
||||
cleanDOI = False
|
||||
if extractDOI:
|
||||
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
|
||||
if re.search('^/', cleanDOI):
|
||||
cleanDOI = cleanDOI[1:]
|
||||
|
||||
# FABSE J fix
|
||||
if re.search('^10.1096', cleanDOI):
|
||||
cleanDOI = cleanDOI[:20]
|
||||
|
||||
# Second JCB fix
|
||||
if re.search('^10.1083', cleanDOI):
|
||||
cleanDOI = cleanDOI[:21]
|
||||
|
||||
if len(cleanDOI) > 40:
|
||||
cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
|
||||
reps = {'.': 'A', '-': '0'}
|
||||
cleanDOItemp = replaceAll(cleanDOItemp[8:], reps)
|
||||
digitStart = 0
|
||||
for i in range(len(cleanDOItemp)):
|
||||
if cleanDOItemp[i].isdigit():
|
||||
digitStart = 1
|
||||
if cleanDOItemp[i].isalpha() and digitStart:
|
||||
break
|
||||
cleanDOI = cleanDOI[0:(8+i)]
|
||||
|
||||
return cleanDOI
|
||||
|
||||
|
||||
def doi2Bib(doi):
|
||||
"""
|
||||
Return a bibTeX string of metadata for a given DOI.
|
||||
From : https://gist.github.com/jrsmith3/5513926
|
||||
"""
|
||||
url = "http://dx.doi.org/" + doi
|
||||
headers = {"accept": "application/x-bibtex"}
|
||||
try:
|
||||
r = requests.get(url, headers=headers)
|
||||
|
||||
if r.headers['content-type'] == 'application/x-bibtex':
|
||||
return r.text
|
||||
else:
|
||||
return ''
|
||||
except requests.exceptions.ConnectionError:
|
||||
warning('Unable to contact remote server to get the bibtex entry ' +
|
||||
'for doi '+doi)
|
||||
return ''
|
||||
|
||||
|
||||
_slugify_strip_re = re.compile(r'[^\w\s-]')
|
||||
_slugify_hyphenate_re = re.compile(r'[\s]+')
|
||||
|
||||
|
||||
def _slugify(value):
|
||||
"""
|
||||
Normalizes string, converts to lowercase, removes non-alpha characters,
|
||||
and converts spaces to hyphens.
|
||||
|
||||
From Django's "django/template/defaultfilters.py".
|
||||
"""
|
||||
import unicodedata
|
||||
if not isinstance(value, unicode):
|
||||
value = unicode(value)
|
||||
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
|
||||
value = unicode(_slugify_strip_re.sub('', value).strip())
|
||||
return _slugify_hyphenate_re.sub('_', value)
|
||||
|
||||
|
||||
def getExtension(filename):
|
||||
"""
|
||||
Get the extension of the filename
|
||||
"""
|
||||
return filename[filename.rfind('.'):]
|
||||
|
||||
|
||||
def checkBibtex(filename, bibtex):
|
||||
print("The bibtex entry found for "+filename+" is :")
|
||||
|
||||
@ -222,7 +41,7 @@ def checkBibtex(filename, bibtex):
|
||||
else:
|
||||
bibtex_string = ''
|
||||
print(bibtex_string)
|
||||
check = rawInput("Is it correct ? [Y/n] ")
|
||||
check = tools.rawInput("Is it correct ? [Y/n] ")
|
||||
|
||||
while check.lower() == 'n':
|
||||
with tempfile.NamedTemporaryFile(suffix=".tmp") as tmpfile:
|
||||
@ -240,7 +59,7 @@ def checkBibtex(filename, bibtex):
|
||||
bibtex_string = ''
|
||||
print("\nThe bibtex entry for "+filename+" is :")
|
||||
print(bibtex_string)
|
||||
check = rawInput("Is it correct ? [Y/n] ")
|
||||
check = tools.rawInput("Is it correct ? [Y/n] ")
|
||||
return bibtex
|
||||
|
||||
|
||||
@ -256,23 +75,23 @@ def addFile(src, filetype):
|
||||
|
||||
if doi is False and isbn is False:
|
||||
if filetype is None:
|
||||
warning("Could not determine the DOI or the ISBN for "+src+"." +
|
||||
tools.warning("Could not determine the DOI or the ISBN for "+src+"." +
|
||||
"Switching to manual entry.")
|
||||
doi_isbn = ''
|
||||
while doi_isbn not in ['doi', 'isbn']:
|
||||
doi_isbn = rawInput("DOI / ISBN ? ").lower()
|
||||
doi_isbn = tools.rawInput("DOI / ISBN ? ").lower()
|
||||
if doi_isbn == 'doi':
|
||||
doi = rawInput('DOI ? ')
|
||||
doi = tools.rawInput('DOI ? ')
|
||||
else:
|
||||
isbn = rawInput('ISBN ? ')
|
||||
isbn = tools.rawInput('ISBN ? ')
|
||||
elif filetype == 'article':
|
||||
warning("Could not determine the DOI for "+src +
|
||||
tools.warning("Could not determine the DOI for "+src +
|
||||
", switching to manual entry.")
|
||||
doi = rawInput('DOI ? ')
|
||||
doi = tools.rawInput('DOI ? ')
|
||||
elif filetype == 'book':
|
||||
warning("Could not determine the ISBN for "+src +
|
||||
tools.warning("Could not determine the ISBN for "+src +
|
||||
", switching to manual entry.")
|
||||
isbn = rawInput('ISBN ? ')
|
||||
isbn = tools.rawInput('ISBN ? ')
|
||||
elif doi is not False:
|
||||
print("DOI for "+src+" is "+doi+".")
|
||||
elif isbn is not False:
|
||||
@ -310,13 +129,13 @@ def addFile(src, filetype):
|
||||
new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip()
|
||||
for i in authors]))
|
||||
|
||||
new_name = params.folder+_slugify(new_name)+getExtension(src)
|
||||
new_name = params.folder+tools.slugify(new_name)+tools.getExtension(src)
|
||||
|
||||
while os.path.exists(new_name):
|
||||
warning("file "+new_name+" already exists.")
|
||||
default_rename = new_name.replace(getExtension(new_name),
|
||||
" (2)"+getExtension(new_name))
|
||||
rename = rawInput("New name ["+default_rename+"] ? ")
|
||||
tools.warning("file "+new_name+" already exists.")
|
||||
default_rename = new_name.replace(tools.getExtension(new_name),
|
||||
" (2)"+tools.getExtension(new_name))
|
||||
rename = tools.rawInput("New name ["+default_rename+"] ? ")
|
||||
if rename == '':
|
||||
new_name = default_rename
|
||||
else:
|
||||
@ -351,7 +170,7 @@ def deleteId(ident):
|
||||
try:
|
||||
os.remove(bibtex[ident]['file'])
|
||||
except:
|
||||
warning("Unable to delete file associated to id "+ident+" : " +
|
||||
tools.warning("Unable to delete file associated to id "+ident+" : " +
|
||||
bibtex[ident]['file'])
|
||||
del(bibtex[ident])
|
||||
bibtexRewrite(bibtex)
|
||||
@ -373,7 +192,7 @@ def deleteFile(filename):
|
||||
try:
|
||||
os.remove(bibtex[key]['file'])
|
||||
except:
|
||||
warning("Unable to delete file associated to id "+key+" : " +
|
||||
tools.warning("Unable to delete file associated to id "+key+" : " +
|
||||
bibtex[key]['file'])
|
||||
del(bibtex[key])
|
||||
if found:
|
||||
@ -382,7 +201,7 @@ def deleteFile(filename):
|
||||
|
||||
|
||||
def downloadFile(url, filetype):
|
||||
dl, contenttype = fetcher.download_url(url)
|
||||
dl, contenttype = fetcher.download(url)
|
||||
|
||||
if dl is not False:
|
||||
tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype)
|
||||
@ -393,7 +212,7 @@ def downloadFile(url, filetype):
|
||||
tmp.close()
|
||||
return new_name
|
||||
else:
|
||||
warning("Could not fetch "+url)
|
||||
tools.warning("Could not fetch "+url)
|
||||
return False
|
||||
|
||||
|
||||
@ -434,13 +253,13 @@ if __name__ == '__main__':
|
||||
if len(sys.argv) < 3:
|
||||
sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID")
|
||||
|
||||
confirm = rawInput("Are you sure you want to delete "+sys.argv[2] +
|
||||
confirm = tools.rawInput("Are you sure you want to delete "+sys.argv[2] +
|
||||
" ? [y/N] ")
|
||||
|
||||
if confirm.lower() == 'y':
|
||||
if not deleteId(sys.argv[2]):
|
||||
if not deleteFile(sys.argv[2]):
|
||||
warning("Unable to delete "+sys.argv[2])
|
||||
tools.warning("Unable to delete "+sys.argv[2])
|
||||
sys.exit(1)
|
||||
|
||||
print(sys.argv[2]+" successfully deleted.")
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Francois Boulogne
|
||||
# License: GPLv3
|
||||
@ -30,6 +30,7 @@ def fixPdf(pdfFile, destination):
|
||||
output.close()
|
||||
shutil.copy(tmp.name, destination)
|
||||
|
||||
|
||||
def tearpage(filename):
|
||||
"""
|
||||
Copy filename to a tempfile, write pages 1..N to filename.
|
||||
@ -60,8 +61,8 @@ def tearpage(filename):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Remove the first page of a PDF',
|
||||
epilog='')
|
||||
parser = argparse.ArgumentParser(description='Remove the first page ' +
|
||||
'of a PDF', epilog='')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
parser.add_argument('pdf', metavar='PDF', help='PDF filepath')
|
||||
args = parser.parse_args()
|
||||
|
Loading…
Reference in New Issue
Block a user