Remove first page of IOP papers + various bugfixes

This commit is contained in:
Phyks 2014-04-26 23:26:25 +02:00
parent 73809b867d
commit 49df58bf70
4 changed files with 132 additions and 28 deletions

View File

@ -34,7 +34,7 @@ Should be almost working and usable now, although still to be considered as **ex
## Installation ## Installation
* Clone this git repository where you want : `git clone https://github.com/Phyks/BMC` * Clone this git repository where you want : `git clone https://github.com/Phyks/BMC`
* Install `requesocks` and `isbntools` _via_ Pypi * Install `requesocks`, `PyPDF2` and `isbntools` _via_ Pypi
* Install `pdftotext` (provided by Xpdf) and `djvulibre` _via_ your package manager the way you want * Install `pdftotext` (provided by Xpdf) and `djvulibre` _via_ your package manager the way you want
* Copy `params.py.example` to `params.py` and customize it to fit your needs * Copy `params.py.example` to `params.py` and customize it to fit your needs
@ -66,6 +66,11 @@ TODO
TODO TODO
### Edit entries
TODO
### Data storage ### Data storage
All your documents will be stored in the papers dir specified in `params.py`. All the bibtex entries will be added to the `index.bib` file. You should **not** add entries to this file (but you can edit existing entries without any problem), as this will break synchronization between documents in papers dir and the index. If you do so, you can rebuild the index file with `./main.py rebuild`. All your documents will be stored in the papers dir specified in `params.py`. All the bibtex entries will be added to the `index.bib` file. You should **not** add entries to this file (but you can edit existing entries without any problem), as this will break synchronization between documents in papers dir and the index. If you do so, you can rebuild the index file with `./main.py rebuild`.
@ -100,16 +105,18 @@ Here are some sources of inspirations for this project :
A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your owns. A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your owns.
* if doi does not exist ?
* download djvu
* test file field for bibtex
* Open * Open
* Confirmation for deletion
* Rebuild * Rebuild
* Remove the watermarks on pdf files : First page of IOP publishing articles => tearpages
* Webserver interface * Webserver interface
* Various re.compile ? * Various re.compile ?
* check output of subprocesses before it ends * check output of subprocesses before it ends
* Split main.py * Split main.py
* Categories * Categories
* Edit an entry instead of deleting it and adding it again * Edit an entry instead of deleting it and adding it again
## Issues ?
See upstream
* homogeneize\_latex\_encoding => to implement again, bug with metadata upstream
* Remove the watermarks on pdf files => done, some warning in okular on generated pdf, but seems ok. Seems to be a bug in PyPDF2.

View File

@ -27,11 +27,16 @@ def download_url(url):
try: try:
r = requests.get(url, proxies=r_proxy) r = requests.get(url, proxies=r_proxy)
contenttype = False
if 'pdf' in r.headers['content-type']:
contenttype = 'pdf'
elif 'djvu' in r.headers['content-type']:
contenttype = 'djvu'
if r.status_code != 200 or 'pdf' not in r.headers['content-type']: if r.status_code != 200 or contenttype is False:
continue continue
return r.content return r.content, contenttype
except: except:
warning("Proxy "+proxy+" not available.") warning("Proxy "+proxy+" not available.")
continue continue

63
main.py
View File

@ -4,6 +4,7 @@
from __future__ import print_function from __future__ import print_function
import fetcher import fetcher
import tearpages
import sys import sys
import shutil import shutil
import tempfile import tempfile
@ -18,7 +19,6 @@ try:
except: except:
from StringIO import StringIO from StringIO import StringIO
from bibtexparser.bparser import BibTexParser from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import homogeneize_latex_encoding
from termios import tcflush, TCIOFLUSH from termios import tcflush, TCIOFLUSH
import params import params
@ -100,7 +100,10 @@ def findISBN(src):
def isbn2Bib(isbn): def isbn2Bib(isbn):
return fmtbib('bibtex', meta(isbn, 'default')) try:
return fmtbib('bibtex', meta(isbn, 'default'))
except:
return ''
def findDOI(src): def findDOI(src):
@ -165,7 +168,11 @@ def doi2Bib(doi):
url = "http://dx.doi.org/" + doi url = "http://dx.doi.org/" + doi
headers = {"accept": "application/x-bibtex"} headers = {"accept": "application/x-bibtex"}
r = requests.get(url, headers=headers) r = requests.get(url, headers=headers)
return r.text
if r.headers['content-type'] == 'application/x-bibtex':
return r.text
else:
return ''
_slugify_strip_re = re.compile(r'[^\w\s-]') _slugify_strip_re = re.compile(r'[^\w\s-]')
@ -198,7 +205,7 @@ def checkBibtex(filename, bibtex):
print("The bibtex entry found for "+filename+" is :") print("The bibtex entry found for "+filename+" is :")
bibtex = StringIO(bibtex) bibtex = StringIO(bibtex)
bibtex = BibTexParser(bibtex, customization=homogeneize_latex_encoding) bibtex = BibTexParser(bibtex)
bibtex = bibtex.get_entry_dict() bibtex = bibtex.get_entry_dict()
bibtex_name = bibtex.keys()[0] bibtex_name = bibtex.keys()[0]
bibtex = bibtex[bibtex_name] bibtex = bibtex[bibtex_name]
@ -261,25 +268,33 @@ def addFile(src, filetype):
elif isbn is not False: elif isbn is not False:
print("ISBN for "+src+" is "+isbn+".") print("ISBN for "+src+" is "+isbn+".")
if doi is not False: if doi is not False and doi != '':
# Add extra \n for bibtexparser # Add extra \n for bibtexparser
bibtex = doi2Bib(doi).strip().replace(',', ",\n")+"\n" bibtex = doi2Bib(doi).strip().replace(',', ",\n")+"\n"
else: elif isbn is not False and isbn != '':
# Idem # Idem
bibtex = isbn2Bib(isbn).strip()+"\n" bibtex = isbn2Bib(isbn).strip()+"\n"
else:
bibtex = ''
bibtex = checkBibtex(src, bibtex) bibtex = checkBibtex(src, bibtex)
authors = re.split(' and ', bibtex['author']) authors = re.split(' and ', bibtex['author'])
if doi is not False: if bibtex['type'] == 'article':
new_name = params.format_articles new_name = params.format_articles
new_name = new_name.replace("%j", bibtex['journal']) try:
else: new_name = new_name.replace("%j", bibtex['journal'])
except:
pass
elif bibtex['type'] == 'book':
new_name = params.format_books new_name = params.format_books
new_name = new_name.replace("%t", bibtex['title']) new_name = new_name.replace("%t", bibtex['title'])
new_name = new_name.replace("%Y", bibtex['year']) try:
new_name = new_name.replace("%Y", bibtex['year'])
except:
pass
new_name = new_name.replace("%f", authors[0].split(',')[0].strip()) new_name = new_name.replace("%f", authors[0].split(',')[0].strip())
new_name = new_name.replace("%l", authors[-1].split(',')[0].strip()) new_name = new_name.replace("%l", authors[-1].split(',')[0].strip())
new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip() new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip()
@ -304,6 +319,10 @@ def addFile(src, filetype):
new_name = False new_name = False
sys.exit("Unable to move file to library dir " + params.folder+".") sys.exit("Unable to move file to library dir " + params.folder+".")
# Remove first page of IOP papers
if 'IOP' in bibtex['publisher'] and bibtex['type'] == 'article':
tearpages.tearpage(new_name)
bibtexAppend(bibtex) bibtexAppend(bibtex)
return new_name return new_name
@ -313,7 +332,7 @@ def deleteId(ident):
Delete a file based on its id in the bibtex file Delete a file based on its id in the bibtex file
""" """
with open(params.folder+'index.bib', 'r') as fh: with open(params.folder+'index.bib', 'r') as fh:
bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding) bibtex = BibTexParser(fh)
bibtex = bibtex.get_entry_dict() bibtex = bibtex.get_entry_dict()
if ident not in bibtex.keys(): if ident not in bibtex.keys():
@ -334,7 +353,7 @@ def deleteFile(filename):
Delete a file based on its filename Delete a file based on its filename
""" """
with open(params.folder+'index.bib', 'r') as fh: with open(params.folder+'index.bib', 'r') as fh:
bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding) bibtex = BibTexParser(fh)
bibtex = bibtex.get_entry_dict() bibtex = bibtex.get_entry_dict()
found = False found = False
@ -353,10 +372,10 @@ def deleteFile(filename):
def downloadFile(url, filetype): def downloadFile(url, filetype):
dl = fetcher.download_url(url) dl, contenttype = fetcher.download_url(url)
if dl is not False: if dl is not False:
tmp = tempfile.NamedTemporaryFile(suffix='.pdf') tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype)
with open(tmp.name, 'w+') as fh: with open(tmp.name, 'w+') as fh:
fh.write(dl) fh.write(dl)
@ -398,19 +417,23 @@ if __name__ == '__main__':
new_name = addFile(sys.argv[2], filetype) new_name = addFile(sys.argv[2], filetype)
if new_name is not False: if new_name is not False:
print("File " + src + " successfully imported as "+new_name+".") print(sys.argv[2]+ " successfully imported as "+new_name+".")
sys.exit() sys.exit()
elif sys.argv[1] == 'delete': elif sys.argv[1] == 'delete':
if len(sys.argv) < 3: if len(sys.argv) < 3:
sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID") sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID")
if not deleteId(sys.argv[2]): confirm = rawInput("Are you sure you want to delete "+sys.argv[2] +
if not deleteFile(sys.argv[2]): " ? [y/N] ")
warning("Unable to delete "+sys.argv[2])
sys.exit(1)
print(sys.argv[2]+" successfully deleted.") if confirm.lower() == 'y':
if not deleteId(sys.argv[2]):
if not deleteFile(sys.argv[2]):
warning("Unable to delete "+sys.argv[2])
sys.exit(1)
print(sys.argv[2]+" successfully deleted.")
sys.exit() sys.exit()
elif sys.argv[1] == 'list': elif sys.argv[1] == 'list':

69
tearpages.py Normal file
View File

@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Francois Boulogne
# License: GPLv3
__version__ = '0.1'
import argparse
import shutil
import tempfile
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.utils import PdfReadError
def fixPdf(pdfFile, destination):
    """
    Fix malformed pdf files when data are present after '%%EOF'

    Copies pdfFile byte-for-byte up to and including the first line
    containing '%%EOF', dropping any trailing garbage, and writes the
    result to destination.

    :param pdfFile: PDF filepath
    :param destination: destination filepath (overwritten)
    """
    tmp = tempfile.NamedTemporaryFile()
    # Use context managers so both handles are closed even on error.
    with open(tmp.name, 'wb') as output:
        with open(pdfFile, "rb") as fh:
            for line in fh:
                output.write(line)
                if b'%%EOF' in line:
                    # Everything after the first %%EOF is the junk we strip.
                    break
    shutil.copy(tmp.name, destination)
def tearpage(filename):
    """
    Remove the first page of a PDF file, in place.

    Copy filename to a tempfile, then write pages 1..N (i.e. all pages
    except the first) back to filename.

    :param filename: PDF filepath (modified in place)
    """
    # Work on a copy so the original can be overwritten safely.
    tmp = tempfile.NamedTemporaryFile()
    shutil.copy(filename, tmp.name)

    # Read the copied pdf
    try:
        input_file = PdfFileReader(open(tmp.name, 'rb'))
    except PdfReadError:
        # Some publishers leave data after '%%EOF'; strip it and retry.
        fixPdf(filename, tmp.name)
        input_file = PdfFileReader(open(tmp.name, 'rb'))

    # Seek for the number of pages
    num_pages = input_file.getNumPages()

    # Write every page EXCEPT the first one (index 0), which is the
    # watermark/cover page we want to tear off.
    output_file = PdfFileWriter()
    for i in range(1, num_pages):
        output_file.addPage(input_file.getPage(i))

    # Overwrite the original file with the torn version before the tmp
    # copy (which backs input_file's pages) is deleted.
    with open(filename, "wb") as outputStream:
        output_file.write(outputStream)
    tmp.close()
if __name__ == '__main__':
    # Command-line entry point: strip the first page of the given PDF.
    cli = argparse.ArgumentParser(
        description='Remove the first page of a PDF',
        epilog='')
    cli.add_argument('--version', action='version', version=__version__)
    cli.add_argument('pdf', metavar='PDF', help='PDF filepath')
    options = cli.parse_args()
    tearpage(options.pdf)