Remove first page of IOP papers + various bugfixes

2014-04-26 23:26:25 +02:00 · 2014-04-26 23:26:25 +02:00 · 49df58bf70
commit 49df58bf70
parent 73809b867d
4 changed files with 132 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -34,7 +34,7 @@ Should be almost working and usable now, although still to be considered as **ex
 ## Installation

 * Clone this git repository where you want : `git clone https://github.com/Phyks/BMC`
-* Install `requesocks` and `isbntools` _via_ Pypi
+* Install `requesocks`, `PyPDF2` and `isbntools` _via_ Pypi
 * Install `pdftotext` (provided by Xpdf) and `djvulibre` _via_ your package manager the way you want
 * Copy `params.py.example` to `params.py` and customize it to fit your needs

@ -66,6 +66,11 @@ TODO

 TODO

+
+### Edit entries
+
+TODO
+
 ### Data storage

 All your documents will be stored in the papers dir specified in `params.py`. All the bibtex entries will be added to the `index.bib` file. You should **not** add entries to this file (but you can edit existing entries without any problem), as this will break synchronization between documents in papers dir and the index. If you do so, you can rebuild the index file with `./main.py rebuild`.
@ -100,16 +105,18 @@ Here are some sources of inspirations for this project :

 A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your own.

-* if doi does not exist ?
-* download djvu
-* test file field for bibtex
 * Open
-* Confirmation for deletion
 * Rebuild
-* Remove the watermarks on pdf files : First page of IOP publishing articles => tearpages
 * Webserver interface
 * Various re.compile ?
 * check output of subprocesses before it ends
 * Split main.py
 * Categories
 * Edit an entry instead of deleting it and adding it again
+
+## Issues ?
+
+See upstream
+
+* homogeneize\_latex\_encoding => to implement again, bug with metadata upstream
+* Remove the watermarks on pdf files => done; okular shows some warnings on the generated pdf, but the result seems ok. This seems to be a bug in PyPDF2.
--- a/fetcher.py
+++ b/fetcher.py
@ -27,11 +27,16 @@ def download_url(url):

        try:
            r = requests.get(url, proxies=r_proxy)
+            contenttype = False
+            if 'pdf' in r.headers['content-type']:
+                contenttype = 'pdf'
+            elif 'djvu' in r.headers['content-type']:
+                contenttype = 'djvu'

-            if r.status_code != 200 or 'pdf' not in r.headers['content-type']:
+            if r.status_code != 200 or contenttype is False:
                continue

-            return r.content
+            return r.content, contenttype
        except:
            warning("Proxy "+proxy+" not available.")
            continue
--- a/main.py
+++ b/main.py
@ -4,6 +4,7 @@
 from __future__ import print_function

 import fetcher
+import tearpages
 import sys
 import shutil
 import tempfile
@ -18,7 +19,6 @@ try:
 except:
    from StringIO import StringIO
 from bibtexparser.bparser import BibTexParser
-from bibtexparser.customization import homogeneize_latex_encoding
 from termios import tcflush, TCIOFLUSH
 import params

@ -100,7 +100,10 @@ def findISBN(src):


 def isbn2Bib(isbn):
+    """
+    Return the bibtex entry for isbn, as formatted by fmtbib.
+
+    :param isbn: ISBN passed to meta() for the metadata lookup
+    :returns: the bibtex string, or '' if the lookup or formatting fails
+    """
+    try:
        return fmtbib('bibtex', meta(isbn, 'default'))
+    # Best-effort lookup: any failure yields an empty entry, but
+    # KeyboardInterrupt / SystemExit are deliberately not swallowed
+    except Exception:
+        return ''


 def findDOI(src):
@ -165,7 +168,11 @@ def doi2Bib(doi):
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    r = requests.get(url, headers=headers)
+
+    if r.headers['content-type'] == 'application/x-bibtex':
        return r.text
+    else
+        return ''


 _slugify_strip_re = re.compile(r'[^\w\s-]')
@ -198,7 +205,7 @@ def checkBibtex(filename, bibtex):
    print("The bibtex entry found for "+filename+" is :")

    bibtex = StringIO(bibtex)
-    bibtex = BibTexParser(bibtex, customization=homogeneize_latex_encoding)
+    bibtex = BibTexParser(bibtex)
    bibtex = bibtex.get_entry_dict()
    bibtex_name = bibtex.keys()[0]
    bibtex = bibtex[bibtex_name]
@ -261,25 +268,33 @@ def addFile(src, filetype):
    elif isbn is not False:
        print("ISBN for "+src+" is "+isbn+".")

-    if doi is not False:
+    if doi is not False and doi != '':
        # Add extra \n for bibtexparser
        bibtex = doi2Bib(doi).strip().replace(',', ",\n")+"\n"
-    else:
+    elif isbn is not False and isbn != '':
        # Idem
        bibtex = isbn2Bib(isbn).strip()+"\n"
+    else:
+        bibtex = ''
    
    bibtex = checkBibtex(src, bibtex)

    authors = re.split(' and ', bibtex['author'])

-    if doi is not False:
+    if bibtex['type'] == 'article':
        new_name = params.format_articles
+        try:
            new_name = new_name.replace("%j", bibtex['journal'])
-    else:
+        except:
+            pass
+    elif bibtex['type'] == 'book':
        new_name = params.format_books

    new_name = new_name.replace("%t", bibtex['title'])
+    try:
        new_name = new_name.replace("%Y", bibtex['year'])
+    except:
+        pass
    new_name = new_name.replace("%f", authors[0].split(',')[0].strip())
    new_name = new_name.replace("%l", authors[-1].split(',')[0].strip())
    new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip()
@ -304,6 +319,10 @@ def addFile(src, filetype):
        new_name = False
        sys.exit("Unable to move file to library dir " + params.folder+".")

+    # Remove first page of IOP papers
+    if 'IOP' in bibtex['publisher'] and bibtex['type'] == 'article':
+        tearpages.tearpage(new_name)
+
    bibtexAppend(bibtex)
    return new_name

@ -313,7 +332,7 @@ def deleteId(ident):
    Delete a file based on its id in the bibtex file
    """
    with open(params.folder+'index.bib', 'r') as fh:
-        bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding)
+        bibtex = BibTexParser(fh)
    bibtex = bibtex.get_entry_dict()

    if ident not in bibtex.keys():
@ -334,7 +353,7 @@ def deleteFile(filename):
    Delete a file based on its filename
    """
    with open(params.folder+'index.bib', 'r') as fh:
-        bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding)
+        bibtex = BibTexParser(fh)
    bibtex = bibtex.get_entry_dict()

    found = False
@ -353,10 +372,10 @@ def deleteFile(filename):


 def downloadFile(url, filetype):
-    dl = fetcher.download_url(url)
+    dl, contenttype = fetcher.download_url(url)

    if dl is not False:
-        tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
+        tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype)

        with open(tmp.name, 'w+') as fh:
            fh.write(dl)
@ -398,13 +417,17 @@ if __name__ == '__main__':

            new_name = addFile(sys.argv[2], filetype)
            if new_name is not False:
-                print("File " + src + " successfully imported as "+new_name+".")
+                print(sys.argv[2]+ " successfully imported as "+new_name+".")
            sys.exit()

        elif sys.argv[1] == 'delete':
            if len(sys.argv) < 3:
                sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID")

+            confirm = rawInput("Are you sure you want to delete "+sys.argv[2] +
+                               " ? [y/N] ")
+
+            if confirm.lower() == 'y':
                if not deleteId(sys.argv[2]):
                    if not deleteFile(sys.argv[2]):
                        warning("Unable to delete "+sys.argv[2])
--- a/tearpages.py
+++ b/tearpages.py
@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Francois Boulogne
+# License: GPLv3
+
+__version__ = '0.1'
+
+import argparse
+import shutil
+import tempfile
+from PyPDF2 import PdfFileWriter, PdfFileReader
+from PyPDF2.utils import PdfReadError
+
+
+def fixPdf(pdfFile, destination):
+    """
+    Fix malformed pdf files when data are present after '%%EOF'
+
+    The file is copied line by line up to and including the first line
+    containing '%%EOF'; everything after that line is dropped.
+
+    :param pdfFile: PDF filepath
+    :param destination: destination filepath
+    """
+    # The truncated copy is built in a temporary file first, so that
+    # destination is only written once pdfFile has been read
+    with tempfile.NamedTemporaryFile() as tmp:
+        with open(pdfFile, "rb") as fh:
+            for line in fh:
+                tmp.write(line)
+                if b'%%EOF' in line:
+                    break
+        # Make sure everything is on disk before copying by name
+        tmp.flush()
+        shutil.copy(tmp.name, destination)
+
+def tearpage(filename):
+    """
+    Remove the first page of a PDF, in place.
+
+    Copy filename to a tempfile, then write every page but the first one
+    back to filename.
+
+    :param filename: PDF filepath
+    """
+    # Work on a temporary copy, since filename is overwritten below
+    with tempfile.NamedTemporaryFile() as tmp:
+        shutil.copy(filename, tmp.name)
+
+        input_stream = open(tmp.name, 'rb')
+        try:
+            # Read the copied pdf; if PyPDF2 rejects it, retry on a copy
+            # truncated after '%%EOF'
+            try:
+                input_file = PdfFileReader(input_stream)
+            except PdfReadError:
+                input_stream.close()
+                fixPdf(filename, tmp.name)
+                input_stream = open(tmp.name, 'rb')
+                input_file = PdfFileReader(input_stream)
+
+            # Seek for the number of pages
+            num_pages = input_file.getNumPages()
+
+            # Write every page except the first one (index 0)
+            output_file = PdfFileWriter()
+            for i in range(1, num_pages):
+                output_file.addPage(input_file.getPage(i))
+
+            # Keep the input stream open until the output is written, in
+            # case PyPDF2 still reads page data from it at this point
+            with open(filename, "wb") as outputStream:
+                output_file.write(outputStream)
+        finally:
+            input_stream.close()
+
+if __name__ == '__main__':
+    # Command-line entry point: takes one PDF path and passes it to tearpage
+    cli = argparse.ArgumentParser(
+        description='Remove the first page of a PDF',
+        epilog='')
+    cli.add_argument('--version', action='version', version=__version__)
+    cli.add_argument('pdf', metavar='PDF', help='PDF filepath')
+
+    tearpage(cli.parse_args().pdf)