Remove first page of IOP papers + various bugfixes

This commit is contained in:
Phyks 2014-04-26 23:26:25 +02:00
parent 73809b867d
commit 49df58bf70
4 changed files with 132 additions and 28 deletions

View File

@ -34,7 +34,7 @@ Should be almost working and usable now, although still to be considered as **ex
## Installation
* Clone this git repository where you want : `git clone https://github.com/Phyks/BMC`
* Install `requesocks` and `isbntools` _via_ Pypi
* Install `requesocks`, `PyPDF2` and `isbntools` _via_ Pypi
* Install `pdftotext` (provided by Xpdf) and `djvulibre` _via_ your package manager the way you want
* Copy `params.py.example` to `params.py` and customize it to fit your needs
@ -66,6 +66,11 @@ TODO
TODO
### Edit entries
TODO
### Data storage
All your documents will be stored in the papers dir specified in `params.py`. All the bibtex entries will be added to the `index.bib` file. You should **not** add entries to this file (but you can edit existing entries without any problem), as this will break synchronization between documents in papers dir and the index. If you do so, you can rebuild the index file with `./main.py rebuild`.
@ -100,16 +105,18 @@ Here are some sources of inspirations for this project :
A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your own.
* if doi does not exist ?
* download djvu
* test file field for bibtex
* Open
* Confirmation for deletion
* Rebuild
* Remove the watermarks on pdf files : First page of IOP publishing articles => tearpages
* Webserver interface
* Various re.compile ?
* check output of subprocesses before it ends
* Split main.py
* Categories
* Edit an entry instead of deleting it and adding it again
## Issues ?
See upstream
* homogeneize\_latex\_encoding => to implement again, bug with metadata upstream
* Remove the watermarks on pdf files => done, some warning in okular on generated pdf, but seems ok. Seems to be a bug in PyPDF2.

View File

@ -27,11 +27,16 @@ def download_url(url):
try:
r = requests.get(url, proxies=r_proxy)
contenttype = False
if 'pdf' in r.headers['content-type']:
contenttype = 'pdf'
elif 'djvu' in r.headers['content-type']:
contenttype = 'djvu'
if r.status_code != 200 or 'pdf' not in r.headers['content-type']:
if r.status_code != 200 or contenttype is False:
continue
return r.content
return r.content, contenttype
except:
warning("Proxy "+proxy+" not available.")
continue

45
main.py
View File

@ -4,6 +4,7 @@
from __future__ import print_function
import fetcher
import tearpages
import sys
import shutil
import tempfile
@ -18,7 +19,6 @@ try:
except:
from StringIO import StringIO
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import homogeneize_latex_encoding
from termios import tcflush, TCIOFLUSH
import params
@ -100,7 +100,10 @@ def findISBN(src):
def isbn2Bib(isbn):
    """
    Return a bibtex string for the given ISBN, looked up via isbntools.

    :param isbn: ISBN string to resolve
    :returns: a bibtex entry as a string, or '' when the lookup fails
    """
    try:
        return fmtbib('bibtex', meta(isbn, 'default'))
    # Deliberate best-effort: any lookup/formatting failure yields ''.
    # `except Exception` (not a bare `except:`) so Ctrl-C still works.
    except Exception:
        return ''
def findDOI(src):
@ -165,7 +168,11 @@ def doi2Bib(doi):
url = "http://dx.doi.org/" + doi
headers = {"accept": "application/x-bibtex"}
r = requests.get(url, headers=headers)
if r.headers['content-type'] == 'application/x-bibtex':
return r.text
else
return ''
_slugify_strip_re = re.compile(r'[^\w\s-]')
@ -198,7 +205,7 @@ def checkBibtex(filename, bibtex):
print("The bibtex entry found for "+filename+" is :")
bibtex = StringIO(bibtex)
bibtex = BibTexParser(bibtex, customization=homogeneize_latex_encoding)
bibtex = BibTexParser(bibtex)
bibtex = bibtex.get_entry_dict()
bibtex_name = bibtex.keys()[0]
bibtex = bibtex[bibtex_name]
@ -261,25 +268,33 @@ def addFile(src, filetype):
elif isbn is not False:
print("ISBN for "+src+" is "+isbn+".")
if doi is not False:
if doi is not False and doi != '':
# Add extra \n for bibtexparser
bibtex = doi2Bib(doi).strip().replace(',', ",\n")+"\n"
else:
elif isbn is not False and isbn != '':
# Idem
bibtex = isbn2Bib(isbn).strip()+"\n"
else:
bibtex = ''
bibtex = checkBibtex(src, bibtex)
authors = re.split(' and ', bibtex['author'])
if doi is not False:
if bibtex['type'] == 'article':
new_name = params.format_articles
try:
new_name = new_name.replace("%j", bibtex['journal'])
else:
except:
pass
elif bibtex['type'] == 'book':
new_name = params.format_books
new_name = new_name.replace("%t", bibtex['title'])
try:
new_name = new_name.replace("%Y", bibtex['year'])
except:
pass
new_name = new_name.replace("%f", authors[0].split(',')[0].strip())
new_name = new_name.replace("%l", authors[-1].split(',')[0].strip())
new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip()
@ -304,6 +319,10 @@ def addFile(src, filetype):
new_name = False
sys.exit("Unable to move file to library dir " + params.folder+".")
# Remove first page of IOP papers
if 'IOP' in bibtex['publisher'] and bibtex['type'] == 'article':
tearpages.tearpage(new_name)
bibtexAppend(bibtex)
return new_name
@ -313,7 +332,7 @@ def deleteId(ident):
Delete a file based on its id in the bibtex file
"""
with open(params.folder+'index.bib', 'r') as fh:
bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding)
bibtex = BibTexParser(fh)
bibtex = bibtex.get_entry_dict()
if ident not in bibtex.keys():
@ -334,7 +353,7 @@ def deleteFile(filename):
Delete a file based on its filename
"""
with open(params.folder+'index.bib', 'r') as fh:
bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding)
bibtex = BibTexParser(fh)
bibtex = bibtex.get_entry_dict()
found = False
@ -353,10 +372,10 @@ def deleteFile(filename):
def downloadFile(url, filetype):
dl = fetcher.download_url(url)
dl, contenttype = fetcher.download_url(url)
if dl is not False:
tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype)
with open(tmp.name, 'w+') as fh:
fh.write(dl)
@ -398,13 +417,17 @@ if __name__ == '__main__':
new_name = addFile(sys.argv[2], filetype)
if new_name is not False:
print("File " + src + " successfully imported as "+new_name+".")
print(sys.argv[2]+ " successfully imported as "+new_name+".")
sys.exit()
elif sys.argv[1] == 'delete':
if len(sys.argv) < 3:
sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID")
confirm = rawInput("Are you sure you want to delete "+sys.argv[2] +
" ? [y/N] ")
if confirm.lower() == 'y':
if not deleteId(sys.argv[2]):
if not deleteFile(sys.argv[2]):
warning("Unable to delete "+sys.argv[2])

69
tearpages.py Normal file
View File

@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Francois Boulogne
# License: GPLv3
__version__ = '0.1'
import argparse
import shutil
import tempfile
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.utils import PdfReadError
def fixPdf(pdfFile, destination):
    """
    Fix malformed pdf files when data are present after '%%EOF'

    Copies *pdfFile* up to and including the first line containing
    the '%%EOF' marker into *destination*, dropping any trailing junk.

    :param pdfFile: PDF filepath
    :param destination: destination filepath (overwritten)
    """
    tmp = tempfile.NamedTemporaryFile()
    # Single context manager handles both handles; the original code
    # opened pdfFile twice (duplicated `with`) and never closed `output`
    # on error.
    with open(tmp.name, 'wb') as output, open(pdfFile, 'rb') as fh:
        for line in fh:
            output.write(line)
            # Keep the %%EOF line itself, drop everything after it
            if b'%%EOF' in line:
                break
    shutil.copy(tmp.name, destination)
def tearpage(filename):
    """
    Remove the first page of a PDF file, in place.

    Copy filename to a tempfile, then write pages 2..N back to filename.

    :param filename: PDF filepath (overwritten)
    """
    # Copy the pdf to a tmp file so we can overwrite the original
    tmp = tempfile.NamedTemporaryFile()
    shutil.copy(filename, tmp.name)

    # Read the copied pdf; fall back to fixPdf() for files with
    # garbage after the '%%EOF' marker
    try:
        input_file = PdfFileReader(open(tmp.name, 'rb'))
    except PdfReadError:
        fixPdf(filename, tmp.name)
        input_file = PdfFileReader(open(tmp.name, 'rb'))

    # Seek for the number of pages
    num_pages = input_file.getNumPages()

    # Write every page EXCEPT the first one.
    # BUG FIX: the original looped over range(0, num_pages), which copied
    # all pages and removed nothing.
    output_file = PdfFileWriter()
    for i in range(1, num_pages):
        output_file.addPage(input_file.getPage(i))

    # Write back and close the output stream explicitly. Keep tmp alive
    # until after write(): PdfFileWriter reads page data lazily from the
    # source file, which tmp.close() deletes.
    with open(filename, 'wb') as outputStream:
        output_file.write(outputStream)
    tmp.close()
if __name__ == '__main__':
    # Command-line entry point: strip the first page of the given PDF.
    cli = argparse.ArgumentParser(
        description='Remove the first page of a PDF',
        epilog='')
    cli.add_argument('--version', action='version', version=__version__)
    cli.add_argument('pdf', metavar='PDF', help='PDF filepath')
    opts = cli.parse_args()
    tearpage(opts.pdf)