Merge pull request #14 from sciunto/master

Tear pages
2014-07-13 17:44:16 +02:00 · 2014-07-13 17:44:16 +02:00 · 20517210ff
commit 20517210ff
parent 818b811e24 61801c50f6
2 changed files with 58 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -56,7 +56,7 @@ Should be almost working and usable now, although still to be considered as **ex
 ```
 git clone https://github.com/Phyks/BMC
 ```
-* Install `arxiv2bib`, `tear-pages`, `requesocks`, `bibtexparser` (https://github.com/sciunto/python-bibtexparser), `PyPDF2` and `isbnlib` _via_ Pypi
+* Install `arxiv2bib`, `requesocks`, `bibtexparser` (https://github.com/sciunto/python-bibtexparser), `PyPDF2` and `isbnlib` _via_ PyPI
 ```
 sudo pip install arxiv2bib requesocks bibtexparser pyPDF2 isbnlib
 ```
@ -132,7 +132,6 @@ All the source code I wrote is under a `no-alcohol beer-ware license`. All funct
 * ---------------------------------------------------------------------------------
 ```

-I used the `tearpages.py` script from sciunto, which can be found [here](https://github.com/sciunto/tear-pages) and is released under a GNU GPLv3 license.

 ## Inspiration

--- a/libbmc/tearpages.py
+++ b/libbmc/tearpages.py
@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Francois Boulogne
+
+import shutil
+import tempfile
+from PyPDF2 import PdfFileWriter, PdfFileReader
+from PyPDF2.utils import PdfReadError
+
+
+def _fixPdf(pdfFile, destination):
+    """
+    Fix malformed pdf files when data are present after '%%EOF'
+
+    :param pdfFile: PDF filepath
+    :param destination: destination
+    """
+    tmp = tempfile.NamedTemporaryFile()
+    output = open(tmp.name, 'wb')
+    with open(pdfFile, "rb") as fh:
+        with open(pdfFile, "rb") as fh:
+            for line in fh:
+                output.write(line)
+                if b'%%EOF' in line:
+                    break
+    output.close()
+    shutil.copy(tmp.name, destination)
+
+
+def tearpage(filename, startpage=1):
+    """
+    Copy filename to a tempfile, write pages startpage..N to filename.
+
+    :param filename: PDF filepath
+    :param startpage: page number for the new first page
+    """
+    # Copy the pdf to a tmp file
+    tmp = tempfile.NamedTemporaryFile()
+    shutil.copy(filename, tmp.name)
+
+    # Read the copied pdf
+    try:
+        input_file = PdfFileReader(open(tmp.name, 'rb'))
+    except PdfReadError:
+        _fixPdf(filename, tmp.name)
+        input_file = PdfFileReader(open(tmp.name, 'rb'))
+    # Seek for the number of pages
+    num_pages = input_file.getNumPages()
+
+    # Write pages excepted the first one
+    output_file = PdfFileWriter()
+    for i in range(startpage, num_pages):
+        output_file.addPage(input_file.getPage(i))
+
+    tmp.close()
+    outputStream = open(filename, "wb")
+    output_file.write(outputStream)