bmc/tearpages.py

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Author: Francois Boulogne
# License: GPLv3

__version__ = '0.1'

import argparse
import shutil
import tempfile
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.utils import PdfReadError


def fixPdf(pdfFile, destination):
    """
    Fix malformed pdf files when data are present after '%%EOF'

    :param pdfFile: PDF filepath
    :param destination: destination
    """
    tmp = tempfile.NamedTemporaryFile()
    output = open(tmp.name, 'wb')
    with open(pdfFile, "rb") as fh:
        with open(pdfFile, "rb") as fh:
            for line in fh:
                output.write(line)
                if b'%%EOF' in line:
                    break
    output.close()
    shutil.copy(tmp.name, destination)


def tearpage(filename):
    """
    Copy filename to a tempfile, write pages 1..N to filename.

    :param filename: PDF filepath
    """
    # Copy the pdf to a tmp file
    tmp = tempfile.NamedTemporaryFile()
    shutil.copy(filename, tmp.name)

    # Read the copied pdf
    try:
        input_file = PdfFileReader(open(tmp.name, 'rb'))
    except PdfReadError:
        fixPdf(filename, tmp.name)
        input_file = PdfFileReader(open(tmp.name, 'rb'))
    # Seek for the number of pages
    num_pages = input_file.getNumPages()

    # Write pages excepted the first one
    output_file = PdfFileWriter()
    for i in range(1, num_pages):
        output_file.addPage(input_file.getPage(i))

    tmp.close()
    outputStream = open(filename, "wb")
    output_file.write(outputStream)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Remove the first page ' +
                                     'of a PDF', epilog='')
    parser.add_argument('--version', action='version', version=__version__)
    parser.add_argument('pdf', metavar='PDF', help='PDF filepath')
    args = parser.parse_args()

    tearpage(args.pdf)
Beginning of refactor 2014-04-28 22:23:05 +02:00			`#!/usr/bin/env python2`
Remove first page of IOP papers + various bugfixes 2014-04-26 23:26:25 +02:00			`# -- coding: utf-8 --`
			`# Author: Francois Boulogne`
			`# License: GPLv3`

			`__version__ = '0.1'`

			`import argparse`
			`import shutil`
			`import tempfile`
			`from PyPDF2 import PdfFileWriter, PdfFileReader`
			`from PyPDF2.utils import PdfReadError`


			`def fixPdf(pdfFile, destination):`
			`"""`
			`Fix malformed pdf files when data are present after '%%EOF'`

			`:param pdfFile: PDF filepath`
			`:param destination: destination`
			`"""`
			`tmp = tempfile.NamedTemporaryFile()`
			`output = open(tmp.name, 'wb')`
			`with open(pdfFile, "rb") as fh:`
			`with open(pdfFile, "rb") as fh:`
			`for line in fh:`
			`output.write(line)`
			`if b'%%EOF' in line:`
			`break`
			`output.close()`
			`shutil.copy(tmp.name, destination)`

Beginning of refactor 2014-04-28 22:23:05 +02:00
Remove first page of IOP papers + various bugfixes 2014-04-26 23:26:25 +02:00			`def tearpage(filename):`
			`"""`
			`Copy filename to a tempfile, write pages 1..N to filename.`

			`:param filename: PDF filepath`
			`"""`
			`# Copy the pdf to a tmp file`
			`tmp = tempfile.NamedTemporaryFile()`
			`shutil.copy(filename, tmp.name)`

			`# Read the copied pdf`
			`try:`
			`input_file = PdfFileReader(open(tmp.name, 'rb'))`
			`except PdfReadError:`
			`fixPdf(filename, tmp.name)`
			`input_file = PdfFileReader(open(tmp.name, 'rb'))`
			`# Seek for the number of pages`
			`num_pages = input_file.getNumPages()`

			`# Write pages excepted the first one`
			`output_file = PdfFileWriter()`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00			`for i in range(1, num_pages):`
Remove first page of IOP papers + various bugfixes 2014-04-26 23:26:25 +02:00			`output_file.addPage(input_file.getPage(i))`

			`tmp.close()`
			`outputStream = open(filename, "wb")`
			`output_file.write(outputStream)`


			`if __name__ == '__main__':`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`parser = argparse.ArgumentParser(description='Remove the first page ' +`
			`'of a PDF', epilog='')`
Remove first page of IOP papers + various bugfixes 2014-04-26 23:26:25 +02:00			`parser.add_argument('--version', action='version', version=__version__)`
			`parser.add_argument('pdf', metavar='PDF', help='PDF filepath')`
			`args = parser.parse_args()`

			`tearpage(args.pdf)`