libbmc/libbmc/papers/tearpages.py

"""
This file contains the necessary functions to determine whether we should tear
the first page from a PDF file, and actually tear it.

TODO: Unittests
"""
import shutil
import tempfile
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.utils import PdfReadError


# Journals known to prepend a useless cover page to their PDFs, which can
# safely be torn off. Pull requests adding missing journals are welcome!
# Each key is a lowercase substring searched for in the BibTeX "journal"
# field; each value is the list of (0-based) page indices to tear.
BAD_JOURNALS = {
    "epl": [0],
    "journal of modern optics": [0],
    "new journal of physics": [0],
}


def fixPdf(pdfFile, destination):
    """
    Fix malformed PDF files when data are present after '%%EOF'.

    The source file is read line by line into a temporary file, stopping
    right after the first line containing ``%%EOF``. The truncated copy is
    then copied to ``destination``. Because the source is fully read before
    the copy happens, ``pdfFile`` and ``destination`` may be the same path.

    .. note::

        Originally from sciunto, https://github.com/sciunto/tear-pages

    :param pdfFile: PDF filepath
    :param destination: Path the truncated copy is written to.
    """
    # The context managers guarantee that both the temporary file and the
    # source file are closed, even if reading or copying raises.
    with tempfile.NamedTemporaryFile() as tmp:
        with open(pdfFile, "rb") as fh:
            for line in fh:
                tmp.write(line)
                if b'%%EOF' in line:
                    # Everything after the end-of-file marker is dropped.
                    break
        # Push buffered data to disk before copying the file by name.
        tmp.flush()
        shutil.copy(tmp.name, destination)


def tearpage_backend(filename, teared_pages=None):
    """
    Copy filename to a tempfile, write pages to filename except the torn ones.

    The file at ``filename`` is overwritten in place with the remaining pages.

    .. note::

        Adapted from sciunto's code, https://github.com/sciunto/tear-pages

    :param filename: PDF filepath
    :param teared_pages: Numbers (0-based) of the pages to tear. Defaults to \
            the first page only.
    """
    # Avoid a mutable default argument shared across calls.
    if teared_pages is None:
        teared_pages = [0]

    # Copy the pdf to a tmp file, removed automatically on exit of the block
    with tempfile.NamedTemporaryFile() as tmp:
        shutil.copy(filename, tmp.name)

        # Read the copied pdf
        input_stream = open(tmp.name, 'rb')
        try:
            try:
                input_file = PdfFileReader(input_stream)
            except PdfReadError:
                # Retry once after dropping any data found after '%%EOF'.
                input_stream.close()
                fixPdf(filename, tmp.name)
                input_stream = open(tmp.name, 'rb')
                input_file = PdfFileReader(input_stream)

            # Seek for the number of pages
            num_pages = input_file.getNumPages()

            # Collect every page except the ones to tear
            output_file = PdfFileWriter()
            for i in range(num_pages):
                if i in teared_pages:
                    continue
                output_file.addPage(input_file.getPage(i))

            # Write while the input stream is still open: the writer was
            # handed page objects coming from the reader, which may still
            # need to access the underlying stream.
            with open(filename, "wb") as output_stream:
                output_file.write(output_stream)
        finally:
            input_stream.close()


def tearpage_needed(bibtex):
    """
    Check whether a given paper needs some pages to be torn or not.

    The lowercased ``journal`` field of the entry is searched for each key of
    ``BAD_JOURNALS``; the first key found as a substring wins.

    :param bibtex: The bibtex entry (dict-like) associated to the paper, to \
            guess whether tearing is needed.
    :returns: A list of pages to tear, empty if no bad journal matched. The \
            list is a fresh copy, so callers may modify it freely.
    """
    # Lowercase once rather than on every iteration.
    journal = bibtex.get("journal", "").lower()

    for name, pages in BAD_JOURNALS.items():
        if name in journal:
            # Bad journal is found. Return a copy so that callers cannot
            # accidentally mutate the module-level table.
            return list(pages)

    # If no bad journals are found, return an empty list
    return []


def tearpage(filename, bibtex=None, force=False):
    """
    Tear some pages of the file if needed.

    :param filename: Path to the file to handle.
    :param bibtex: BibTeX dict associated to this file, as the one given by \
            ``bibtexparser``. (Mandatory if force is not specified)
    :param force: If a list of integers, force the tearing of the \
            specified pages, ignoring ``bibtex``. ``False`` or ``None`` \
            disables forcing. (Optional)
    :returns: A boolean indicating whether the file has been torn or not. \
            Side effect is tearing the necessary pages from the file.
    """
    # Fetch pages to tear. ``None`` is accepted as "not forced", just like
    # ``False``, instead of crashing later on.
    if force is not False and force is not None:
        pages_to_tear = force
    elif bibtex is not None:
        pages_to_tear = tearpage_needed(bibtex)
    else:
        pages_to_tear = []

    # Nothing to tear: leave the file untouched
    if not pages_to_tear:
        return False

    # Tearing is needed, do it and report it
    tearpage_backend(filename, teared_pages=pages_to_tear)
    return True
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`"""`
			`This file contains the necessary functions to determine whether we should tear`
			`the first page from a PDF file, and actually tear it.`
Fix citation fetching from arXiv papers 2016-01-20 23:40:07 +01:00
			`TODO: Unittests`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`"""`
Tearpages ok 2016-01-30 16:28:53 +01:00			`import shutil`
			`import tempfile`
			`from PyPDF2 import PdfFileWriter, PdfFileReader`
			`from PyPDF2.utils import PdfReadError`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00

Use journal instead of publisher for page tearing 2016-01-30 17:18:32 +01:00			`# Dict of bad journals which adds an extra useless first page, which can be`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`# teared. Please, submit a PR to include new ones which I may not be aware of!`
Use journal instead of publisher for page tearing 2016-01-30 17:18:32 +01:00			`# This dict associates the journal string to look for and to a list of pages`
Tearpages ok 2016-01-30 16:28:53 +01:00			`# to tear.`
Use journal instead of publisher for page tearing 2016-01-30 17:18:32 +01:00			`BAD_JOURNALS = {`
			`"epl": [0],`
			`"journal of modern optics": [0],`
			`"new journal of physics": [0]`
Tearpages ok 2016-01-30 16:28:53 +01:00			`}`


			`def fixPdf(pdfFile, destination):`
			`"""`
			`Fix malformed pdf files when data are present after '%%EOF'`

			`..note ::`

			`Originally from sciunto, https://github.com/sciunto/tear-pages`

			`:param pdfFile: PDF filepath`
			`:param destination: destination`
			`"""`
			`tmp = tempfile.NamedTemporaryFile()`
			`output = open(tmp.name, 'wb')`
			`with open(pdfFile, "rb") as fh:`
			`with open(pdfFile, "rb") as fh:`
			`for line in fh:`
			`output.write(line)`
			`if b'%%EOF' in line:`
			`break`
			`output.close()`
			`shutil.copy(tmp.name, destination)`


			`def tearpage_backend(filename, teared_pages=[0]):`
			`"""`
			`Copy filename to a tempfile, write pages to filename except the teared one.`

			`..note ::`

			`Adapted from sciunto's code, https://github.com/sciunto/tear-pages`

			`:param filename: PDF filepath`
			`:param teared_pages: Numbers of the pages to tear. Default to first page \`
			`only.`
			`"""`
			`# Copy the pdf to a tmp file`
			`tmp = tempfile.NamedTemporaryFile()`
			`shutil.copy(filename, tmp.name)`

			`# Read the copied pdf`
			`try:`
			`input_file = PdfFileReader(open(tmp.name, 'rb'))`
			`except PdfReadError:`
			`fixPdf(filename, tmp.name)`
			`input_file = PdfFileReader(open(tmp.name, 'rb'))`
			`# Seek for the number of pages`
			`num_pages = input_file.getNumPages()`

			`# Write pages excepted the first one`
			`output_file = PdfFileWriter()`
			`for i in range(num_pages):`
			`if i in teared_pages:`
			`continue`
			`output_file.addPage(input_file.getPage(i))`

			`tmp.close()`
			`outputStream = open(filename, "wb")`
			`output_file.write(outputStream)`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00

			`def tearpage_needed(bibtex):`
			`"""`
Tearpages ok 2016-01-30 16:28:53 +01:00			`Check whether a given paper needs some pages to be teared or not.`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00
			`:params bibtex: The bibtex entry associated to the paper, to guess \`
			`whether tearing is needed.`
Tearpages ok 2016-01-30 16:28:53 +01:00			`:returns: A list of pages to tear.`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`"""`
Use journal instead of publisher for page tearing 2016-01-30 17:18:32 +01:00			`for p in BAD_JOURNALS:`
			`if p in bibtex.get("journal", "").lower():`
			`# Bad journal is found, add pages to tear`
			`return BAD_JOURNALS[p]`
Tearpages ok 2016-01-30 16:28:53 +01:00
Use journal instead of publisher for page tearing 2016-01-30 17:18:32 +01:00			`# If no bad journals are found, return an empty list`
Tearpages ok 2016-01-30 16:28:53 +01:00			`return []`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00

Tearpages ok 2016-01-30 16:28:53 +01:00			`def tearpage(filename, bibtex=None, force=False):`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`"""`
Add a __valid_identifiers__ list to ease fetching of identifiers in papers See the detailed explanations in README.md. Also fixed some typos in docstrings. 2016-02-01 17:32:24 +01:00			`Tear some pages of the file if needed.`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00
			`:params filename: Path to the file to handle.`
			`:params bibtex: BibTeX dict associated to this file, as the one given by \`
Tearpages ok 2016-01-30 16:28:53 +01:00			``bibtexparser``. (Mandatory if force is not specified)
			`:params force: If a list of integers, force the tearing of the \`
			`specified pages. (Optional)`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`:returns: A boolean indicating whether the file has been teared or not. \`
Tearpages ok 2016-01-30 16:28:53 +01:00			`Side effect is tearing the necessary pages from the file.`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`"""`
Tearpages ok 2016-01-30 16:28:53 +01:00			`# Fetch pages to tear`
			`pages_to_tear = []`
			`if force is not False:`
			`pages_to_tear = force`
			`elif bibtex is not None:`
			`pages_to_tear = tearpage_needed(bibtex)`

			`if len(pages_to_tear) > 0:`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`# If tearing is needed, do it and return True`
Tearpages ok 2016-01-30 16:28:53 +01:00			`tearpage_backend(filename, teared_pages=pages_to_tear)`
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`return True`
Tearpages ok 2016-01-30 16:28:53 +01:00
Add some functions to tear first pages from a PDF 2016-01-10 17:52:45 +01:00			`# Else, simply return False`
			`return False`