From 357e268e963c869102f4f9e8de01ab85cf8d6daf Mon Sep 17 00:00:00 2001 From: Bryan Bishop Date: Sat, 9 Feb 2013 07:41:50 -0600 Subject: [PATCH] use pdfparanoia to remove watermarks --- modules/papers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/papers.py b/modules/papers.py index e4f67a4..22b88bd 100644 --- a/modules/papers.py +++ b/modules/papers.py @@ -9,6 +9,8 @@ import requests import lxml.etree from StringIO import StringIO +import pdfparanoia + def download(phenny, input, verbose=True): """ Downloads a paper. @@ -202,6 +204,7 @@ def download_url(url): new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"}) new_content = new_response.content if "pdf" in new_response.headers["content-type"]: + new_content = pdfparanoia.scrub(new_content) extension = ".pdf" except Exception: pass @@ -244,6 +247,9 @@ def download_url(url): path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension) + if extension in [".pdf", "pdf"]: + content = pdfparanoia.scrub(content) + file_handler = open(path, "w") file_handler.write(content) file_handler.close()