attempt at multi URL downloads
This commit is contained in:
parent
f186b7d009
commit
e4074d2b3d
@ -1,7 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Fetches papers.
|
Fetches papers.
|
||||||
"""
|
"""
|
||||||
|
import re
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
@ -39,91 +39,92 @@ def download(phenny, input, verbose=True):
|
|||||||
# don't bother if there's nothing there
|
# don't bother if there's nothing there
|
||||||
if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
|
if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
|
||||||
return
|
return
|
||||||
|
for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
|
||||||
|
translation_url = "http://localhost:1969/web"
|
||||||
|
|
||||||
translation_url = "http://localhost:1969/web"
|
headers = {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
}
|
||||||
|
|
||||||
headers = {
|
data = {
|
||||||
"Content-Type": "application/json",
|
"url": line,
|
||||||
}
|
"sessionid": "what"
|
||||||
|
}
|
||||||
|
|
||||||
data = {
|
data = json.dumps(data)
|
||||||
"url": line,
|
|
||||||
"sessionid": "what"
|
|
||||||
}
|
|
||||||
|
|
||||||
data = json.dumps(data)
|
response = requests.post(translation_url, data=data, headers=headers)
|
||||||
|
|
||||||
response = requests.post(translation_url, data=data, headers=headers)
|
if response.status_code == 200:
|
||||||
|
# see if there are any attachments
|
||||||
|
content = json.loads(response.content)
|
||||||
|
item = content[0]
|
||||||
|
title = item["title"]
|
||||||
|
|
||||||
if response.status_code == 200:
|
if item.has_key("attachments"):
|
||||||
# see if there are any attachments
|
pdf_url = None
|
||||||
content = json.loads(response.content)
|
for attachment in item["attachments"]:
|
||||||
item = content[0]
|
if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
|
||||||
title = item["title"]
|
pdf_url = attachment["url"]
|
||||||
|
break
|
||||||
|
|
||||||
if item.has_key("attachments"):
|
if pdf_url:
|
||||||
pdf_url = None
|
user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
|
||||||
for attachment in item["attachments"]:
|
|
||||||
if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
|
|
||||||
pdf_url = attachment["url"]
|
|
||||||
break
|
|
||||||
|
|
||||||
if pdf_url:
|
headers = {
|
||||||
user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
|
"User-Agent": user_agent,
|
||||||
|
}
|
||||||
|
|
||||||
headers = {
|
response = None
|
||||||
"User-Agent": user_agent,
|
if pdf_url.startswith("https://"):
|
||||||
}
|
response = requests.get(pdf_url, headers=headers, verify=False)
|
||||||
|
else:
|
||||||
|
response = requests.get(pdf_url, headers=headers)
|
||||||
|
|
||||||
response = None
|
# detect failure
|
||||||
if pdf_url.startswith("https://"):
|
if response.status_code == 401:
|
||||||
response = requests.get(pdf_url, headers=headers, verify=False)
|
phenny.say("HTTP 401 unauthorized " + str(pdf_url))
|
||||||
else:
|
continue
|
||||||
response = requests.get(pdf_url, headers=headers)
|
elif response.status_code != 200:
|
||||||
|
phenny.say("HTTP " + str(response.status_code) + " " + str(pdf_url))
|
||||||
|
continue
|
||||||
|
|
||||||
# detect failure
|
data = response.content
|
||||||
if response.status_code == 401:
|
|
||||||
phenny.say("HTTP 401 unauthorized " + str(pdf_url))
|
|
||||||
return
|
|
||||||
elif response.status_code != 200:
|
|
||||||
phenny.say("HTTP " + str(response.status_code) + " " + str(pdf_url))
|
|
||||||
return
|
|
||||||
|
|
||||||
data = response.content
|
# grr..
|
||||||
|
title = title.encode("ascii", "ignore")
|
||||||
|
|
||||||
# grr..
|
path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")
|
||||||
title = title.encode("ascii", "ignore")
|
|
||||||
|
|
||||||
path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")
|
file_handler = open(path, "w")
|
||||||
|
file_handler.write(data)
|
||||||
|
file_handler.close()
|
||||||
|
|
||||||
file_handler = open(path, "w")
|
filename = requests.utils.quote(title)
|
||||||
file_handler.write(data)
|
url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"
|
||||||
file_handler.close()
|
|
||||||
|
|
||||||
filename = requests.utils.quote(title)
|
phenny.say(url)
|
||||||
url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"
|
continue
|
||||||
|
elif verbose and explicit:
|
||||||
phenny.say(url)
|
phenny.say("error: didn't find any pdfs on " + line)
|
||||||
return
|
phenny.say(download_url(line))
|
||||||
elif verbose and explicit:
|
continue
|
||||||
phenny.say("error: didn't find any pdfs on " + line)
|
elif verbose and explicit:
|
||||||
phenny.say(download_url(line))
|
phenny.say("error: dunno how to find the pdf on " + line)
|
||||||
return
|
phenny.say(download_url(line))
|
||||||
elif verbose and explicit:
|
continue
|
||||||
phenny.say("error: dunno how to find the pdf on " + line)
|
elif verbose and explicit:
|
||||||
phenny.say(download_url(line))
|
if response.status_code == 501:
|
||||||
return
|
if verbose:
|
||||||
elif verbose and explicit:
|
phenny.say("no translator available, raw dump: " + download_url(line))
|
||||||
if response.status_code == 501:
|
continue
|
||||||
if verbose:
|
else:
|
||||||
phenny.say("no translator available, raw dump: " + download_url(line))
|
if verbose:
|
||||||
return
|
phenny.say("error: HTTP " + str(response.status_code) + " " + download_url(line))
|
||||||
else:
|
continue
|
||||||
if verbose:
|
else:
|
||||||
phenny.say("error: HTTP " + str(response.status_code) + " " + download_url(line))
|
continue
|
||||||
return
|
return
|
||||||
else:
|
|
||||||
return
|
|
||||||
download.commands = ["fetch", "get", "download"]
|
download.commands = ["fetch", "get", "download"]
|
||||||
download.priority = "high"
|
download.priority = "high"
|
||||||
download.rule = r'(.*)'
|
download.rule = r'(.*)'
|
||||||
|
Loading…
Reference in New Issue
Block a user