diff --git a/README.md b/README.md
index 3e85ac6..66b6ae5 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,6 @@
 libBMC
 ======
 
-This is a **WIP**.
-
-## TODO
-
-* Generate documentation
-
-
 ## Presentation
 
 A generic Python library to manage bibliography and play with scientific
diff --git a/libbmc/papers/tearpages.py b/libbmc/papers/tearpages.py
index 3cd2f24..016f00f 100644
--- a/libbmc/papers/tearpages.py
+++ b/libbmc/papers/tearpages.py
@@ -4,44 +4,135 @@ the first page from a PDF file, and actually tear it.
 
 TODO: Unittests
 """
-import tearpages
+import shutil
+import tempfile
+from PyPDF2 import PdfFileWriter, PdfFileReader
+from PyPDF2.utils import PdfReadError
 
 
-# List of bad publishers which adds an extra useless first page, which can be
+# Dict of bad publishers which add an extra useless first page, which can be
 # teared. Please, submit a PR to include new ones which I may not be aware of!
-BAD_PUBLISHERS = [
-    "IOP"
-]
+# This dict associates each publisher string to look for with the list of
+# pages to tear.
+BAD_PUBLISHERS = {
+    "IOP": [0]
+}
+
+
+def fixPdf(pdfFile, destination):
+    """
+    Fix malformed pdf files when data are present after '%%EOF'
+
+    .. note::
+
+        Originally from sciunto, https://github.com/sciunto/tear-pages
+
+    :param pdfFile: PDF filepath
+    :param destination: destination
+    """
+    # Buffer the truncated copy first, so that destination is only written
+    # once the copy is complete (destination may be pdfFile itself).
+    with tempfile.TemporaryFile() as tmp:
+        with open(pdfFile, "rb") as fh:
+            for line in fh:
+                tmp.write(line)
+                # Drop everything located after the first '%%EOF' marker
+                if b'%%EOF' in line:
+                    break
+        tmp.seek(0)
+        with open(destination, "wb") as output:
+            shutil.copyfileobj(tmp, output)
+
+
+def tearpage_backend(filename, teared_pages=None):
+    """
+    Copy filename to a tempfile, write pages to filename except the teared one.
+
+    .. note::
+
+        Adapted from sciunto's code, https://github.com/sciunto/tear-pages
+
+    :param filename: PDF filepath
+    :param teared_pages: Numbers of the pages to tear. Default to first page \
+            only.
+    """
+    # None stands for the default value, avoiding a mutable default argument
+    if teared_pages is None:
+        teared_pages = [0]
+
+    # Copy the pdf to a tmp file, removed when leaving the with block
+    with tempfile.NamedTemporaryFile() as tmp:
+        shutil.copy(filename, tmp.name)
+
+        input_stream = open(tmp.name, 'rb')
+        try:
+            # Read the copied pdf
+            try:
+                input_file = PdfFileReader(input_stream)
+            except PdfReadError:
+                # Malformed pdf, fix the copy and read it again
+                input_stream.close()
+                fixPdf(filename, tmp.name)
+                input_stream = open(tmp.name, 'rb')
+                input_file = PdfFileReader(input_stream)
+            # Seek for the number of pages
+            num_pages = input_file.getNumPages()
+
+            # Write all the pages except the teared ones
+            output_file = PdfFileWriter()
+            for i in range(num_pages):
+                if i in teared_pages:
+                    continue
+                output_file.addPage(input_file.getPage(i))
+
+            # The output must be fully written before the copy gets closed
+            with open(filename, "wb") as output_stream:
+                output_file.write(output_stream)
+        finally:
+            input_stream.close()
 
 
 def tearpage_needed(bibtex):
     """
-    Check whether a given paper needs the first page to be teared or not.
+    Check whether a given paper needs some pages to be teared or not.
 
     :params bibtex: The bibtex entry associated to the paper, to guess \
             whether tearing is needed.
-    :returns: A boolean indicating whether first page should be teared or not.
+    :returns: A list of pages to tear.
     """
-    # For each bad publisher, look for it in the publisher bibtex entry
-    has_bad_publisher = [p in bibtex.get("publisher", [])
-                         for p in BAD_PUBLISHERS]
-    # Return True iff there is at least one bad publisher
-    return (True in has_bad_publisher)
+    publisher = bibtex.get("publisher", "")
+    for name, pages in BAD_PUBLISHERS.items():
+        if name in publisher:
+            # Bad publisher is found, return a copy of its pages to tear
+            return list(pages)
+
+    # If no bad publishers are found, return an empty list
+    return []
 
 
-def tearpage(filename, bibtex=None):
+def tearpage(filename, bibtex=None, force=False):
     """
-    Tear the first page of the file if needed.
+    Tear some pages of the file if needed.
 
     :params filename: Path to the file to handle.
     :params bibtex: BibTeX dict associated to this file, as the one given by \
-            ``bibtexparser``.
+            ``bibtexparser``. (Mandatory if force is not specified)
+    :params force: If a list of integers, force the tearing of the \
+            specified pages. (Optional)
     :returns: A boolean indicating whether the file has been teared or not. \
-            Side effect is tearing the first page from the file.
+            Side effect is tearing the necessary pages from the file.
     """
-    if bibtex is not None and tearpage_needed(bibtex):
+    # Fetch pages to tear
+    pages_to_tear = []
+    if force is not False and force is not None:
+        pages_to_tear = force
+    elif bibtex is not None:
+        pages_to_tear = tearpage_needed(bibtex)
+
+    if len(pages_to_tear) > 0:
         # If tearing is needed, do it and return True
-        tearpages.tearpage(filename)
+        tearpage_backend(filename, teared_pages=pages_to_tear)
         return True
 
+    # Else, simply return False
     return False
diff --git a/requirements.txt b/requirements.txt
index 8386216..a6e4aac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,4 @@ bibtexparser>=0.6.2
 isbnlib>=3.5.7
 requests>=2.9.1
 PySocks>=1.5.6
-#tear-pages>=0.2.2
+PyPDF2>=1.25.1