Merge pull request #14 from sciunto/master

Tear pages
This commit is contained in:
Lucas Verney 2014-07-13 17:44:16 +02:00
commit 20517210ff
2 changed files with 58 additions and 2 deletions

View File

@ -56,7 +56,7 @@ Should be almost working and usable now, although still to be considered as **ex
```
git clone https://github.com/Phyks/BMC
```
* Install `arxiv2bib`, `tear-pages`, `requesocks`, `bibtexparser` (https://github.com/sciunto/python-bibtexparser), `PyPDF2` and `isbnlib` _via_ Pypi
* Install `arxiv2bib`, `requesocks`, `bibtexparser` (https://github.com/sciunto/python-bibtexparser), `PyPDF2` and `isbnlib` _via_ Pypi
```
sudo pip install arxiv2bib requesocks bibtexparser pyPDF2 isbnlib
```
@ -132,7 +132,6 @@ All the source code I wrote is under a `no-alcohol beer-ware license`. All funct
* ---------------------------------------------------------------------------------
```
I used the `tearpages.py` script from sciunto, which can be found [here](https://github.com/sciunto/tear-pages) and is released under a GNU GPLv3 license.
## Inspiration

57
libbmc/tearpages.py Executable file
View File

@ -0,0 +1,57 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Francois Boulogne
import shutil
import tempfile
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.utils import PdfReadError
def _fixPdf(pdfFile, destination):
"""
Fix malformed pdf files when data are present after '%%EOF'
:param pdfFile: PDF filepath
:param destination: destination
"""
tmp = tempfile.NamedTemporaryFile()
output = open(tmp.name, 'wb')
with open(pdfFile, "rb") as fh:
with open(pdfFile, "rb") as fh:
for line in fh:
output.write(line)
if b'%%EOF' in line:
break
output.close()
shutil.copy(tmp.name, destination)
def tearpage(filename, startpage=1):
"""
Copy filename to a tempfile, write pages startpage..N to filename.
:param filename: PDF filepath
:param startpage: page number for the new first page
"""
# Copy the pdf to a tmp file
tmp = tempfile.NamedTemporaryFile()
shutil.copy(filename, tmp.name)
# Read the copied pdf
try:
input_file = PdfFileReader(open(tmp.name, 'rb'))
except PdfReadError:
_fixPdf(filename, tmp.name)
input_file = PdfFileReader(open(tmp.name, 'rb'))
# Seek for the number of pages
num_pages = input_file.getNumPages()
# Write pages excepted the first one
output_file = PdfFileWriter()
for i in range(startpage, num_pages):
output_file.addPage(input_file.getPage(i))
tmp.close()
outputStream = open(filename, "wb")
output_file.write(outputStream)