libbmc/libbmc/papers/tearpages.py

125 lines
3.6 KiB
Python

"""
This file contains the necessary functions to determine whether we should tear
the first page from a PDF file, and actually tear it.
TODO: Unittests
"""
import shutil
import tempfile
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.utils import PdfReadError
# Dict of bad publishers which adds an extra useless first page, which can be
# teared. Please, submit a PR to include new ones which I may not be aware of!
# This dict associates the publisher string to look for and to a list of pages
# to tear.
BAD_PUBLISHERS = {
"IOP": [0]
}
def fixPdf(pdfFile, destination):
"""
Fix malformed pdf files when data are present after '%%EOF'
..note ::
Originally from sciunto, https://github.com/sciunto/tear-pages
:param pdfFile: PDF filepath
:param destination: destination
"""
tmp = tempfile.NamedTemporaryFile()
output = open(tmp.name, 'wb')
with open(pdfFile, "rb") as fh:
with open(pdfFile, "rb") as fh:
for line in fh:
output.write(line)
if b'%%EOF' in line:
break
output.close()
shutil.copy(tmp.name, destination)
def tearpage_backend(filename, teared_pages=[0]):
"""
Copy filename to a tempfile, write pages to filename except the teared one.
..note ::
Adapted from sciunto's code, https://github.com/sciunto/tear-pages
:param filename: PDF filepath
:param teared_pages: Numbers of the pages to tear. Default to first page \
only.
"""
# Copy the pdf to a tmp file
tmp = tempfile.NamedTemporaryFile()
shutil.copy(filename, tmp.name)
# Read the copied pdf
try:
input_file = PdfFileReader(open(tmp.name, 'rb'))
except PdfReadError:
fixPdf(filename, tmp.name)
input_file = PdfFileReader(open(tmp.name, 'rb'))
# Seek for the number of pages
num_pages = input_file.getNumPages()
# Write pages excepted the first one
output_file = PdfFileWriter()
for i in range(num_pages):
if i in teared_pages:
continue
output_file.addPage(input_file.getPage(i))
tmp.close()
outputStream = open(filename, "wb")
output_file.write(outputStream)
def tearpage_needed(bibtex):
"""
Check whether a given paper needs some pages to be teared or not.
:params bibtex: The bibtex entry associated to the paper, to guess \
whether tearing is needed.
:returns: A list of pages to tear.
"""
for p in BAD_PUBLISHERS:
if p in bibtex.get("publisher", ""):
# Bad publisher is found, add pages to tear
return BAD_PUBLISHERS[p]
# If no bad publishers are found, return an empty list
return []
def tearpage(filename, bibtex=None, force=False):
"""
Tear the some pages of the file if needed.
:params filename: Path to the file to handle.
:params bibtex: BibTeX dict associated to this file, as the one given by \
``bibtexparser``. (Mandatory if force is not specified)
:params force: If a list of integers, force the tearing of the \
specified pages. (Optional)
:returns: A boolean indicating whether the file has been teared or not. \
Side effect is tearing the necessary pages from the file.
"""
# Fetch pages to tear
pages_to_tear = []
if force is not False:
pages_to_tear = force
elif bibtex is not None:
pages_to_tear = tearpage_needed(bibtex)
if len(pages_to_tear) > 0:
# If tearing is needed, do it and return True
tearpage_backend(filename, teared_pages=pages_to_tear)
return True
# Else, simply return False
return False