Remove first page of IOP papers + various bugfixes
This commit is contained in:
parent
73809b867d
commit
49df58bf70
19
README.md
19
README.md
@ -34,7 +34,7 @@ Should be almost working and usable now, although still to be considered as **ex
|
||||
## Installation
|
||||
|
||||
* Clone this git repository where you want : `git clone https://github.com/Phyks/BMC`
|
||||
* Install `requesocks` and `isbntools` _via_ Pypi
|
||||
* Install `requesocks`, `PyPDF2` and `isbntools` _via_ Pypi
|
||||
* Install `pdftotext` (provided by Xpdf) and `djvulibre` _via_ your package manager the way you want
|
||||
* Copy `params.py.example` to `params.py` and customize it to fit your needs
|
||||
|
||||
@ -66,6 +66,11 @@ TODO
|
||||
|
||||
TODO
|
||||
|
||||
|
||||
### Edit entries
|
||||
|
||||
TODO
|
||||
|
||||
### Data storage
|
||||
|
||||
All your documents will be stored in the papers dir specified in `params.py`. All the bibtex entries will be added to the `index.bib` file. You should **not** add entries to this file (but you can edit existing entries without any problem), as this will break synchronization between documents in papers dir and the index. If you do so, you can rebuild the index file with `./main.py rebuild`.
|
||||
@ -100,16 +105,18 @@ Here are some sources of inspirations for this project :
|
||||
|
||||
A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your own.
|
||||
|
||||
* if doi does not exist ?
|
||||
* download djvu
|
||||
* test file field for bibtex
|
||||
* Open
|
||||
* Confirmation for deletion
|
||||
* Rebuild
|
||||
* Remove the watermarks on pdf files : First page of IOP publishing articles => tearpages
|
||||
* Webserver interface
|
||||
* Various re.compile ?
|
||||
* check output of subprocesses before it ends
|
||||
* Split main.py
|
||||
* Categories
|
||||
* Edit an entry instead of deleting it and adding it again
|
||||
|
||||
## Issues ?
|
||||
|
||||
See upstream
|
||||
|
||||
* homogeneize\_latex\_encoding => to implement again, bug with metadata upstream
|
||||
* Remove the watermarks on pdf files => done, some warning in okular on generated pdf, but seems ok. Seems to be a bug in PyPDF2.
|
||||
|
@ -27,11 +27,16 @@ def download_url(url):
|
||||
|
||||
try:
|
||||
r = requests.get(url, proxies=r_proxy)
|
||||
contenttype = False
|
||||
if 'pdf' in r.headers['content-type']:
|
||||
contenttype = 'pdf'
|
||||
elif 'djvu' in r.headers['content-type']:
|
||||
contenttype = 'djvu'
|
||||
|
||||
if r.status_code != 200 or 'pdf' not in r.headers['content-type']:
|
||||
if r.status_code != 200 or contenttype is False:
|
||||
continue
|
||||
|
||||
return r.content
|
||||
return r.content, contenttype
|
||||
except:
|
||||
warning("Proxy "+proxy+" not available.")
|
||||
continue
|
||||
|
45
main.py
45
main.py
@ -4,6 +4,7 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import fetcher
|
||||
import tearpages
|
||||
import sys
|
||||
import shutil
|
||||
import tempfile
|
||||
@ -18,7 +19,6 @@ try:
|
||||
except:
|
||||
from StringIO import StringIO
|
||||
from bibtexparser.bparser import BibTexParser
|
||||
from bibtexparser.customization import homogeneize_latex_encoding
|
||||
from termios import tcflush, TCIOFLUSH
|
||||
import params
|
||||
|
||||
@ -100,7 +100,10 @@ def findISBN(src):
|
||||
|
||||
|
||||
def isbn2Bib(isbn):
    """
    Try to build a bibtex entry from an ISBN.

    :param isbn: ISBN passed to meta() using the 'default' service
    :returns: the value returned by fmtbib('bibtex', ...), or an empty
              string if meta() or fmtbib() raises
    """
    try:
        return fmtbib('bibtex', meta(isbn, 'default'))
    except Exception:
        # Best-effort lookup: any failure yields an empty entry. Catching
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate to the caller.
        return ''
|
||||
|
||||
|
||||
def findDOI(src):
|
||||
@ -165,7 +168,11 @@ def doi2Bib(doi):
|
||||
url = "http://dx.doi.org/" + doi
|
||||
headers = {"accept": "application/x-bibtex"}
|
||||
r = requests.get(url, headers=headers)
|
||||
|
||||
if r.headers['content-type'] == 'application/x-bibtex':
|
||||
return r.text
|
||||
else
|
||||
return ''
|
||||
|
||||
|
||||
_slugify_strip_re = re.compile(r'[^\w\s-]')
|
||||
@ -198,7 +205,7 @@ def checkBibtex(filename, bibtex):
|
||||
print("The bibtex entry found for "+filename+" is :")
|
||||
|
||||
bibtex = StringIO(bibtex)
|
||||
bibtex = BibTexParser(bibtex, customization=homogeneize_latex_encoding)
|
||||
bibtex = BibTexParser(bibtex)
|
||||
bibtex = bibtex.get_entry_dict()
|
||||
bibtex_name = bibtex.keys()[0]
|
||||
bibtex = bibtex[bibtex_name]
|
||||
@ -261,25 +268,33 @@ def addFile(src, filetype):
|
||||
elif isbn is not False:
|
||||
print("ISBN for "+src+" is "+isbn+".")
|
||||
|
||||
if doi is not False:
|
||||
if doi is not False and doi != '':
|
||||
# Add extra \n for bibtexparser
|
||||
bibtex = doi2Bib(doi).strip().replace(',', ",\n")+"\n"
|
||||
else:
|
||||
elif isbn is not False and isbn != '':
|
||||
# Idem
|
||||
bibtex = isbn2Bib(isbn).strip()+"\n"
|
||||
else:
|
||||
bibtex = ''
|
||||
|
||||
bibtex = checkBibtex(src, bibtex)
|
||||
|
||||
authors = re.split(' and ', bibtex['author'])
|
||||
|
||||
if doi is not False:
|
||||
if bibtex['type'] == 'article':
|
||||
new_name = params.format_articles
|
||||
try:
|
||||
new_name = new_name.replace("%j", bibtex['journal'])
|
||||
else:
|
||||
except:
|
||||
pass
|
||||
elif bibtex['type'] == 'book':
|
||||
new_name = params.format_books
|
||||
|
||||
new_name = new_name.replace("%t", bibtex['title'])
|
||||
try:
|
||||
new_name = new_name.replace("%Y", bibtex['year'])
|
||||
except:
|
||||
pass
|
||||
new_name = new_name.replace("%f", authors[0].split(',')[0].strip())
|
||||
new_name = new_name.replace("%l", authors[-1].split(',')[0].strip())
|
||||
new_name = new_name.replace("%a", ', '.join([i.split(',')[0].strip()
|
||||
@ -304,6 +319,10 @@ def addFile(src, filetype):
|
||||
new_name = False
|
||||
sys.exit("Unable to move file to library dir " + params.folder+".")
|
||||
|
||||
# Remove first page of IOP papers
|
||||
if 'IOP' in bibtex['publisher'] and bibtex['type'] == 'article':
|
||||
tearpages.tearpage(new_name)
|
||||
|
||||
bibtexAppend(bibtex)
|
||||
return new_name
|
||||
|
||||
@ -313,7 +332,7 @@ def deleteId(ident):
|
||||
Delete a file based on its id in the bibtex file
|
||||
"""
|
||||
with open(params.folder+'index.bib', 'r') as fh:
|
||||
bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding)
|
||||
bibtex = BibTexParser(fh)
|
||||
bibtex = bibtex.get_entry_dict()
|
||||
|
||||
if ident not in bibtex.keys():
|
||||
@ -334,7 +353,7 @@ def deleteFile(filename):
|
||||
Delete a file based on its filename
|
||||
"""
|
||||
with open(params.folder+'index.bib', 'r') as fh:
|
||||
bibtex = BibTexParser(fh, customization=homogeneize_latex_encoding)
|
||||
bibtex = BibTexParser(fh)
|
||||
bibtex = bibtex.get_entry_dict()
|
||||
|
||||
found = False
|
||||
@ -353,10 +372,10 @@ def deleteFile(filename):
|
||||
|
||||
|
||||
def downloadFile(url, filetype):
|
||||
dl = fetcher.download_url(url)
|
||||
dl, contenttype = fetcher.download_url(url)
|
||||
|
||||
if dl is not False:
|
||||
tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
|
||||
tmp = tempfile.NamedTemporaryFile(suffix='.'+contenttype)
|
||||
|
||||
with open(tmp.name, 'w+') as fh:
|
||||
fh.write(dl)
|
||||
@ -398,13 +417,17 @@ if __name__ == '__main__':
|
||||
|
||||
new_name = addFile(sys.argv[2], filetype)
|
||||
if new_name is not False:
|
||||
print("File " + src + " successfully imported as "+new_name+".")
|
||||
print(sys.argv[2]+ " successfully imported as "+new_name+".")
|
||||
sys.exit()
|
||||
|
||||
elif sys.argv[1] == 'delete':
|
||||
if len(sys.argv) < 3:
|
||||
sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID")
|
||||
|
||||
confirm = rawInput("Are you sure you want to delete "+sys.argv[2] +
|
||||
" ? [y/N] ")
|
||||
|
||||
if confirm.lower() == 'y':
|
||||
if not deleteId(sys.argv[2]):
|
||||
if not deleteFile(sys.argv[2]):
|
||||
warning("Unable to delete "+sys.argv[2])
|
||||
|
69
tearpages.py
Normal file
69
tearpages.py
Normal file
@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Francois Boulogne
|
||||
# License: GPLv3
|
||||
|
||||
__version__ = '0.1'
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import tempfile
|
||||
from PyPDF2 import PdfFileWriter, PdfFileReader
|
||||
from PyPDF2.utils import PdfReadError
|
||||
|
||||
|
||||
def fixPdf(pdfFile, destination):
    """
    Fix malformed pdf files when data are present after '%%EOF'

    The content of pdfFile is copied line by line up to and including the
    first line that contains '%%EOF'; everything after it is dropped. If no
    line contains '%%EOF', the whole file is copied.

    :param pdfFile: PDF filepath
    :param destination: destination
    """
    # Go through a temporary file so that destination may safely be the
    # same path as pdfFile. The context managers guarantee that both
    # handles are closed (and the temporary file removed) even on error.
    with tempfile.NamedTemporaryFile() as tmp:
        with open(pdfFile, "rb") as fh:
            for line in fh:
                tmp.write(line)
                if b'%%EOF' in line:
                    break
        # Make sure everything is on disk before copying by name
        tmp.flush()
        shutil.copy(tmp.name, destination)
|
||||
|
||||
def tearpage(filename):
    """
    Remove the first page of a PDF file, in place.

    filename is copied to a temporary file, then every page but the first
    one is written back to filename. A file with at most one page is left
    untouched, as removing its only page would leave an empty document.

    :param filename: PDF filepath
    """
    with tempfile.NamedTemporaryFile() as tmp:
        # Work on a copy of the pdf so that filename can be overwritten
        shutil.copy(filename, tmp.name)

        # The reader needs its stream until the output has been written,
        # so the handle is kept open and closed in the finally clause.
        source = open(tmp.name, 'rb')
        try:
            try:
                input_file = PdfFileReader(source)
            except PdfReadError:
                # Retry once after dropping any data found after '%%EOF'
                source.close()
                fixPdf(filename, tmp.name)
                source = open(tmp.name, 'rb')
                input_file = PdfFileReader(source)

            # Seek for the number of pages
            num_pages = input_file.getNumPages()
            if num_pages <= 1:
                return

            # Write pages excepted the first one (page indices start at 0)
            output_file = PdfFileWriter()
            for i in range(1, num_pages):
                output_file.addPage(input_file.getPage(i))

            with open(filename, "wb") as outputStream:
                output_file.write(outputStream)
        finally:
            source.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: expects the path of the PDF to process
    cli = argparse.ArgumentParser(
        description='Remove the first page of a PDF',
        epilog='')
    cli.add_argument('--version', action='version', version=__version__)
    cli.add_argument('pdf', metavar='PDF', help='PDF filepath')

    options = cli.parse_args()
    tearpage(options.pdf)
|
Loading…
Reference in New Issue
Block a user