From 49df58bf70f59e716cee6d922b80f2c748d24f53 Mon Sep 17 00:00:00 2001 From: Phyks Date: Sat, 26 Apr 2014 23:26:25 +0200 Subject: [PATCH] Remove first page of IOP papers + various bugfixes --- README.md | 19 ++++++++++----- fetcher.py | 9 +++++-- main.py | 63 ++++++++++++++++++++++++++++++++--------------- tearpages.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 132 insertions(+), 28 deletions(-) create mode 100644 tearpages.py diff --git a/README.md b/README.md index 1f87950..fd074cd 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Should be almost working and usable now, although still to be considered as **ex ## Installation * Clone this git repository where you want : `git clone https://github.com/Phyks/BMC` -* Install `requesocks` and `isbntools` _via_ Pypi +* Install `requesocks`, `PyPDF2` and `isbntools` _via_ Pypi * Install `pdftotext` (provided by Xpdf) and `djvulibre` _via_ your package manager the way you want * Copy `params.py.example` to `params.py` and customize it to fit your needs @@ -66,6 +66,11 @@ TODO TODO + +### Edit entries + +TODO + ### Data storage All your documents will be stored in the papers dir specified in `params.py`. All the bibtex entries will be added to the `index.bib` file. You should **not** add entries to this file (but you can edit existing entries without any problem), as this will break synchronization between documents in papers dir and the index. If you do so, you can rebuild the index fie with `./main.py rebuild`. @@ -100,16 +105,18 @@ Here are some sources of inspirations for this project : A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your owns. -* if doi does not exist ? -* download djvu -* test file field for bibtex * Open -* Confirmation for deletion * Rebuild -* Remove the watermarks on pdf files : First page of IOP publishing articles => tearpages * Webserver interface * Various re.compile ? 
* check output of subprocesses before it ends * Split main.py * Categories * Edit an entry instead of deleting it and adding it again + +## Issues ? + +See upstream + +* homogeneize\_latex\_encoding => to implement again, bug with metadata upstream +* Remove the watermarks on pdf files => done, some warning in okular on generated pdf, but seems ok. Seems to be a bug in PyPDF2. diff --git a/fetcher.py b/fetcher.py index 0a22783..79dd0bf 100755 --- a/fetcher.py +++ b/fetcher.py @@ -27,11 +27,16 @@ def download_url(url): try: r = requests.get(url, proxies=r_proxy) + contenttype = False + if 'pdf' in r.headers['content-type']: + contenttype = 'pdf' + elif 'djvu' in r.headers['content-type']: + contenttype = 'djvu' - if r.status_code != 200 or 'pdf' not in r.headers['content-type']: + if r.status_code != 200 or contenttype is False: continue - return r.content + return r.content, contenttype except: warning("Proxy "+proxy+" not available.") continue diff --git a/main.py b/main.py index 25f410f..f79a0ed 100755 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ from __future__ import print_function import fetcher +import tearpages import sys import shutil import tempfile @@ -18,7 +19,6 @@ try: except: from StringIO import StringIO from bibtexparser.bparser import BibTexParser -from bibtexparser.customization import homogeneize_latex_encoding from termios import tcflush, TCIOFLUSH import params @@ -100,7 +100,10 @@ def findISBN(src): def isbn2Bib(isbn): - return fmtbib('bibtex', meta(isbn, 'default')) + try: + return fmtbib('bibtex', meta(isbn, 'default')) + except: + return '' def findDOI(src): @@ -165,7 +168,11 @@ def doi2Bib(doi): url = "http://dx.doi.org/" + doi headers = {"accept": "application/x-bibtex"} r = requests.get(url, headers=headers) - return r.text + + if r.headers['content-type'] == 'application/x-bibtex': + return r.text + else: + return '' _slugify_strip_re = re.compile(r'[^\w\s-]') @@ -198,7 +205,7 @@ def checkBibtex(filename, bibtex): print("The bibtex entry 
found for "+filename+" is :") bibtex = StringIO(bibtex) - bibtex = BibTexParser(bibtex, customization=homogeneize_latex_encoding) + bibtex = BibTexParser(bibtex) bibtex = bibtex.get_entry_dict() bibtex_name = bibtex.keys()[0] bibtex = bibtex[bibtex_name] @@ -261,25 +268,33 @@ def addFile(src, filetype): elif isbn is not False: print("ISBN for "+src+" is "+isbn+".") - if doi is not False: + if doi is not False and doi != '': # Add extra \n for bibtexparser bibtex = doi2Bib(doi).strip().replace(',', ",\n")+"\n" - else: + elif isbn is not False and isbn != '': # Idem bibtex = isbn2Bib(isbn).strip()+"\n" + else: + bibtex = '' bibtex = checkBibtex(src, bibtex) authors = re.split(' and ', bibtex['author']) - if doi is not False: + if bibtex['type'] == 'article': new_name = params.format_articles - new_name = new_name.replace("%j", bibtex['journal']) - else: + try: + new_name = new_name.replace("%j", bibtex['journal']) + except: + pass + elif bibtex['type'] == 'book': new_name = params.format_books new_name = new_name.replace("%t", bibtex['title']) - new_name = new_name.replace("%Y", bibtex['year']) + try: + new_name = new_name.replace("%Y", bibtex['year']) + except: + pass new_name = new_name.replace("%f", authors[0].split(',')[0].strip()) new_name = new_name.replace("%l", authors[-1].split(',')[0].strip()) new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip() @@ -304,6 +319,10 @@ def addFile(src, filetype): new_name = False sys.exit("Unable to move file to library dir " + params.folder+".") + # Remove first page of IOP papers + if bibtex['type'] == 'article' and 'IOP' in bibtex.get('publisher', ''): + tearpages.tearpage(new_name) + bibtexAppend(bibtex) return new_name @@ -313,7 +332,7 @@ def deleteId(ident): Delete a file based on its id in the bibtex file """ with open(params.folder+'index.bib', 'r') as fh: - bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding) + bibtex = BibTexParser(fh) bibtex = bibtex.get_entry_dict() if ident not in 
bibtex.keys(): @@ -334,7 +353,7 @@ def deleteFile(filename): Delete a file based on its filename """ with open(params.folder+'index.bib', 'r') as fh: - bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding) + bibtex = BibTexParser(fh) bibtex = bibtex.get_entry_dict() found = False @@ -353,10 +372,10 @@ def deleteFile(filename): def downloadFile(url, filetype): - dl = fetcher.download_url(url) + dl, contenttype = fetcher.download_url(url) if dl is not False: - tmp = tempfile.NamedTemporaryFile(suffix='.pdf') + tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype) with open(tmp.name, 'w+') as fh: fh.write(dl) @@ -398,19 +417,23 @@ if __name__ == '__main__': new_name = addFile(sys.argv[2], filetype) if new_name is not False: - print("File " + src + " successfully imported as "+new_name+".") + print(sys.argv[2]+ " successfully imported as "+new_name+".") sys.exit() elif sys.argv[1] == 'delete': if len(sys.argv) < 3: sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID") - if not deleteId(sys.argv[2]): - if not deleteFile(sys.argv[2]): - warning("Unable to delete "+sys.argv[2]) - sys.exit(1) + confirm = rawInput("Are you sure you want to delete "+sys.argv[2] + + " ? 
[y/N] ") - print(sys.argv[2]+" successfully deleted.") + if confirm.lower() == 'y': + if not deleteId(sys.argv[2]): + if not deleteFile(sys.argv[2]): + warning("Unable to delete "+sys.argv[2]) + sys.exit(1) + + print(sys.argv[2]+" successfully deleted.") sys.exit() elif sys.argv[1] == 'list': diff --git a/tearpages.py b/tearpages.py new file mode 100644 index 0000000..10e9e6a --- /dev/null +++ b/tearpages.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Francois Boulogne +# License: GPLv3 + +__version__ = '0.1' + +import argparse +import shutil +import tempfile +from PyPDF2 import PdfFileWriter, PdfFileReader +from PyPDF2.utils import PdfReadError + + +def fixPdf(pdfFile, destination): + """ + Fix malformed pdf files when data are present after '%%EOF' + + :param pdfFile: PDF filepath + :param destination: destination + """ + tmp = tempfile.NamedTemporaryFile() + output = open(tmp.name, 'wb') + with open(pdfFile, "rb") as fh: + for line in fh: + output.write(line) + if b'%%EOF' in line: + break + output.close() + shutil.copy(tmp.name, destination) + +def tearpage(filename): + """ + Copy filename to a tempfile, write pages 2..N to filename. 
+ + :param filename: PDF filepath + """ + # Copy the pdf to a tmp file + tmp = tempfile.NamedTemporaryFile() + shutil.copy(filename, tmp.name) + + # Read the copied pdf + try: + input_file = PdfFileReader(open(tmp.name, 'rb')) + except PdfReadError: + fixPdf(filename, tmp.name) + input_file = PdfFileReader(open(tmp.name, 'rb')) + # Seek for the number of pages + num_pages = input_file.getNumPages() + + # Write pages except the first one + output_file = PdfFileWriter() + for i in range(1, num_pages): + output_file.addPage(input_file.getPage(i)) + + tmp.close() + outputStream = open(filename, "wb") + output_file.write(outputStream) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Remove the first page of a PDF', + epilog='') + parser.add_argument('--version', action='version', version=__version__) + parser.add_argument('pdf', metavar='PDF', help='PDF filepath') + args = parser.parse_args() + + tearpage(args.pdf)