Merge pull request #14 from sciunto/master

Tear pages
2014-07-13 17:44:16 +02:00 · 2014-07-13 17:44:16 +02:00 · 20517210ff
commit 20517210ff
parent 818b811e24 61801c50f6
2 changed files with 58 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ Should be almost working and usable now, although still to be considered as **ex
 ```
 git clone https://github.com/Phyks/BMC
 ```
-* Install `arxiv2bib`, `tear-pages`, `requesocks`, `bibtexparser` (https://github.com/sciunto/python-bibtexparser), `PyPDF2` and `isbnlib` _via_ Pypi
+* Install `arxiv2bib`, `requesocks`, `bibtexparser` (https://github.com/sciunto/python-bibtexparser), `PyPDF2` and `isbnlib` _via_ Pypi
 ```
 sudo pip install arxiv2bib requesocks bibtexparser pyPDF2 isbnlib
 ```
@@ -132,7 +132,6 @@ All the source code I wrote is under a `no-alcohol beer-ware license`. All funct
 * ---------------------------------------------------------------------------------
 ```
 I used the `tearpages.py` script from sciunto, which can be found [here](https://github.com/sciunto/tear-pages) and is released under a GNU GPLv3 license.
 ## Inspiration
--- a/libbmc/tearpages.py
+++ b/libbmc/tearpages.py
@@ -0,0 +1,57 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Author: Francois Boulogne
 import shutil
 import tempfile
 from PyPDF2 import PdfFileWriter, PdfFileReader
 from PyPDF2.utils import PdfReadError
 def _fixPdf(pdfFile, destination):
    """
    Fix malformed pdf files when data are present after '%%EOF'

    The source is copied line by line up to and including the first line
    that contains the '%%EOF' marker; everything after that line is
    dropped. If no line contains the marker, the whole file is copied.

    :param pdfFile: PDF filepath
    :param destination: path the truncated copy is written to
    """
    # Stage the truncated copy in a temporary file so the source is read
    # completely before destination is touched (they may be the same path).
    with tempfile.NamedTemporaryFile() as tmp:
        with open(pdfFile, "rb") as fh:
            for line in fh:
                tmp.write(line)
                # NOTE(review): this stops at the FIRST marker; a PDF saved
                # with incremental updates may hold several '%%EOF' lines.
                if b'%%EOF' in line:
                    break
        # Make sure buffered bytes reach the disk before copying by name.
        tmp.flush()
        shutil.copy(tmp.name, destination)
 def tearpage(filename, startpage=1):
    """
    Remove the leading pages of a PDF, rewriting the file in place.

    filename is copied to a temporary file, then pages startpage..N-1 of
    that copy (N being the page count) are written back to filename.

    :param filename: PDF filepath; the file is overwritten
    :param startpage: zero-based index of the first page to keep, i.e. the
        number of leading pages to drop (the default, 1, tears off the
        first page only)
    """
    # Work on a temporary copy so filename can be overwritten at the end.
    with tempfile.NamedTemporaryFile() as tmp:
        shutil.copy(filename, tmp.name)

        # Read the copied pdf
        source = open(tmp.name, 'rb')
        try:
            try:
                input_file = PdfFileReader(source)
            except PdfReadError:
                # Retry on a copy truncated after '%%EOF'; release the
                # handle on the unreadable copy first.
                source.close()
                _fixPdf(filename, tmp.name)
                source = open(tmp.name, 'rb')
                input_file = PdfFileReader(source)

            # Look up the number of pages
            num_pages = input_file.getNumPages()

            # Collect every page from startpage onwards
            output_file = PdfFileWriter()
            for i in range(startpage, num_pages):
                output_file.addPage(input_file.getPage(i))

            # The reader's stream is kept open until the pages have been
            # written, and the output is closed so the result is flushed.
            with open(filename, "wb") as outputStream:
                output_file.write(outputStream)
        finally:
            source.close()