2016-01-10 17:52:45 +01:00
|
|
|
"""
|
|
|
|
This file contains the functions needed to decide whether some pages (usually
a useless first page added by the journal) should be torn from a PDF file, and
to actually tear them.
|
2016-01-20 23:40:07 +01:00
|
|
|
|
|
|
|
TODO: Unittests
|
2016-01-10 17:52:45 +01:00
|
|
|
"""
|
2016-01-30 16:28:53 +01:00
|
|
|
import shutil
|
|
|
|
import tempfile
|
|
|
|
from PyPDF2 import PdfFileWriter, PdfFileReader
|
|
|
|
from PyPDF2.utils import PdfReadError
|
2016-01-10 17:52:45 +01:00
|
|
|
|
|
|
|
|
2016-01-30 17:18:32 +01:00
|
|
|
# Journals known to prepend a useless extra page to their PDF files, which can
# safely be torn. Please submit a PR to add any journal missing from this list!
#
# Each key is the (lowercase) journal string to look for, and the associated
# value is the list of zero-based page indices to tear for that journal.
BAD_JOURNALS = {
    "epl": [0],
    "journal of modern optics": [0],
    "new journal of physics": [0],
}
|
|
|
|
|
|
|
|
|
|
|
|
def fixPdf(pdfFile, destination):
    """
    Fix malformed pdf files when data are present after '%%EOF'

    The source file is copied line by line up to (and including) the first
    line containing ``%%EOF``; everything after that line is dropped. If no
    such line exists, the whole file is copied unchanged.

    .. note::

        Originally from sciunto, https://github.com/sciunto/tear-pages

    :param pdfFile: PDF filepath
    :param destination: Path the truncated copy is written to (passed as \
            the destination of ``shutil.copy``).
    """
    # The truncated content goes through a temporary file first, so the source
    # is completely read (and closed) before ``destination`` is written.
    with tempfile.NamedTemporaryFile() as tmp:
        with open(pdfFile, "rb") as fh:
            for line in fh:
                tmp.write(line)
                if b'%%EOF' in line:
                    break
        # Push buffered data to disk before the file is re-read by name.
        tmp.flush()
        shutil.copy(tmp.name, destination)
|
|
|
|
|
|
|
|
|
|
|
|
def tearpage_backend(filename, teared_pages=None):
    """
    Copy filename to a tempfile, write pages to filename except the teared one.

    The file at ``filename`` is overwritten in place with the remaining pages.

    .. note::

        Adapted from sciunto's code, https://github.com/sciunto/tear-pages

    :param filename: PDF filepath
    :param teared_pages: Zero-based numbers of the pages to tear. Default to \
            first page only.
    """
    # ``None`` stands for "first page only"; this avoids a mutable default.
    if teared_pages is None:
        teared_pages = [0]
    pages_to_skip = frozenset(teared_pages)

    # Copy the pdf to a tmp file, so that the original can be overwritten
    # while its pages are read from the copy.
    with tempfile.NamedTemporaryFile() as tmp:
        shutil.copy(filename, tmp.name)

        # Read the copied pdf
        stream = open(tmp.name, 'rb')
        try:
            try:
                input_file = PdfFileReader(stream)
            except PdfReadError:
                # Reading failed: retry on a copy truncated after '%%EOF'.
                stream.close()
                fixPdf(filename, tmp.name)
                stream = open(tmp.name, 'rb')
                input_file = PdfFileReader(stream)

            # Seek for the number of pages
            num_pages = input_file.getNumPages()

            # Write every page except the teared ones
            output_file = PdfFileWriter()
            for i in range(num_pages):
                if i not in pages_to_skip:
                    output_file.addPage(input_file.getPage(i))

            # The input stream is kept open until the output is written, as
            # the writer may still need to read page data from it.
            with open(filename, "wb") as output_stream:
                output_file.write(output_stream)
        finally:
            stream.close()
|
2016-01-10 17:52:45 +01:00
|
|
|
|
|
|
|
|
|
|
|
def tearpage_needed(bibtex):
    """
    Check whether a given paper needs some pages to be teared or not.

    The ``journal`` field of the entry is lowercased and compared against the
    keys of ``BAD_JOURNALS``; the first key found in it wins.

    :params bibtex: The bibtex entry associated to the paper, to guess \
            whether tearing is needed.
    :returns: A list of pages to tear (empty if no bad journal matches). \
            The list is a fresh copy, so it can be modified freely.
    """
    # ``or ""`` also covers an entry whose journal field is missing or None.
    journal = (bibtex.get("journal") or "").lower()

    # NOTE(review): this is a plain substring match, so a short key such as
    # "epl" also matches any journal name merely containing these letters.
    for name, pages in BAD_JOURNALS.items():
        if name in journal:
            # Bad journal is found; return a copy so that callers cannot
            # alter the module-level table.
            return list(pages)

    # If no bad journals are found, return an empty list
    return []
|
2016-01-10 17:52:45 +01:00
|
|
|
|
|
|
|
|
2016-01-30 16:28:53 +01:00
|
|
|
def tearpage(filename, bibtex=None, force=False):
    """
    Tear some pages of the file if needed.

    :param filename: Path to the file to handle.
    :param bibtex: BibTeX dict associated to this file, as the one given by \
            ``bibtexparser``. (Mandatory if force is not specified)
    :param force: If a list of integers, force the tearing of the \
            specified pages. (Optional)
    :returns: A boolean indicating whether the file has been teared or not. \
            Side effect is tearing the necessary pages from the file.
    """
    # Work out which pages should go: an explicit ``force`` always wins,
    # otherwise the BibTeX entry (if any) is used to guess them.
    if force is not False:
        selected = force
    elif bibtex is not None:
        selected = tearpage_needed(bibtex)
    else:
        selected = []

    # Nothing to tear: leave the file untouched
    if len(selected) == 0:
        return False

    # Tearing is needed, do it and report it
    tearpage_backend(filename, teared_pages=selected)
    return True
|