use pdfparanoia to remove watermarks
This commit is contained in:
parent
bef66e1241
commit
357e268e96
@ -9,6 +9,8 @@ import requests
|
|||||||
import lxml.etree
|
import lxml.etree
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
|
|
||||||
|
import pdfparanoia
|
||||||
|
|
||||||
def download(phenny, input, verbose=True):
|
def download(phenny, input, verbose=True):
|
||||||
"""
|
"""
|
||||||
Downloads a paper.
|
Downloads a paper.
|
||||||
@ -202,6 +204,7 @@ def download_url(url):
|
|||||||
new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
|
new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
|
||||||
new_content = new_response.content
|
new_content = new_response.content
|
||||||
if "pdf" in new_response.headers["content-type"]:
|
if "pdf" in new_response.headers["content-type"]:
|
||||||
|
new_content = pdfparanoia.scrub(new_content)
|
||||||
extension = ".pdf"
|
extension = ".pdf"
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
@ -244,6 +247,9 @@ def download_url(url):
|
|||||||
|
|
||||||
path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)
|
path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)
|
||||||
|
|
||||||
|
if extension in [".pdf", "pdf"]:
|
||||||
|
content = pdfparanoia.scrub(content)
|
||||||
|
|
||||||
file_handler = open(path, "w")
|
file_handler = open(path, "w")
|
||||||
file_handler.write(content)
|
file_handler.write(content)
|
||||||
file_handler.close()
|
file_handler.close()
|
||||||
|
Loading…
Reference in New Issue
Block a user