attempt at multi URL downloads

Nathan McCorkle 2013-01-10 21:08:36 -08:00
parent f186b7d009
commit e4074d2b3d
1 changed file with 71 additions and 70 deletions

@@ -1,7 +1,7 @@
 """
 Fetches papers.
 """
+import re
 import os
 import json
 import random
@@ -39,91 +39,92 @@ def download(phenny, input, verbose=True):
     # don't bother if there's nothing there
     if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
         return
-    translation_url = "http://localhost:1969/web"
-    headers = {
-        "Content-Type": "application/json",
-    }
-    data = {
-        "url": line,
-        "sessionid": "what"
-    }
-    data = json.dumps(data)
-    response = requests.post(translation_url, data=data, headers=headers)
-    if response.status_code == 200:
-        # see if there are any attachments
-        content = json.loads(response.content)
-        item = content[0]
-        title = item["title"]
-        if item.has_key("attachments"):
-            pdf_url = None
-            for attachment in item["attachments"]:
-                if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
-                    pdf_url = attachment["url"]
-                    break
-            if pdf_url:
-                user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
-                headers = {
-                    "User-Agent": user_agent,
-                }
-                response = None
-                if pdf_url.startswith("https://"):
-                    response = requests.get(pdf_url, headers=headers, verify=False)
-                else:
-                    response = requests.get(pdf_url, headers=headers)
-                # detect failure
-                if response.status_code == 401:
-                    phenny.say("HTTP 401 unauthorized " + str(pdf_url))
-                    return
-                elif response.status_code != 200:
-                    phenny.say("HTTP " + str(response.status_code) + " " + str(pdf_url))
-                    return
-                data = response.content
-                # grr..
-                title = title.encode("ascii", "ignore")
-                path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")
-                file_handler = open(path, "w")
-                file_handler.write(data)
-                file_handler.close()
-                filename = requests.utils.quote(title)
-                url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"
-                phenny.say(url)
-                return
-            elif verbose and explicit:
-                phenny.say("error: didn't find any pdfs on " + line)
-                phenny.say(download_url(line))
-                return
-        elif verbose and explicit:
-            phenny.say("error: dunno how to find the pdf on " + line)
-            phenny.say(download_url(line))
-            return
-    elif verbose and explicit:
-        if response.status_code == 501:
-            if verbose:
-                phenny.say("no translator available, raw dump: " + download_url(line))
-                return
-        else:
-            if verbose:
-                phenny.say("error: HTTP " + str(response.status_code) + " " + download_url(line))
-                return
-    else:
-        return
+    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
+        translation_url = "http://localhost:1969/web"
+        headers = {
+            "Content-Type": "application/json",
+        }
+        data = {
+            "url": line,
+            "sessionid": "what"
+        }
+        data = json.dumps(data)
+        response = requests.post(translation_url, data=data, headers=headers)
+        if response.status_code == 200:
+            # see if there are any attachments
+            content = json.loads(response.content)
+            item = content[0]
+            title = item["title"]
+            if item.has_key("attachments"):
+                pdf_url = None
+                for attachment in item["attachments"]:
+                    if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
+                        pdf_url = attachment["url"]
+                        break
+                if pdf_url:
+                    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
+                    headers = {
+                        "User-Agent": user_agent,
+                    }
+                    response = None
+                    if pdf_url.startswith("https://"):
+                        response = requests.get(pdf_url, headers=headers, verify=False)
+                    else:
+                        response = requests.get(pdf_url, headers=headers)
+                    # detect failure
+                    if response.status_code == 401:
+                        phenny.say("HTTP 401 unauthorized " + str(pdf_url))
+                        continue
+                    elif response.status_code != 200:
+                        phenny.say("HTTP " + str(response.status_code) + " " + str(pdf_url))
+                        continue
+                    data = response.content
+                    # grr..
+                    title = title.encode("ascii", "ignore")
+                    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")
+                    file_handler = open(path, "w")
+                    file_handler.write(data)
+                    file_handler.close()
+                    filename = requests.utils.quote(title)
+                    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"
+                    phenny.say(url)
+                    continue
+                elif verbose and explicit:
+                    phenny.say("error: didn't find any pdfs on " + line)
+                    phenny.say(download_url(line))
+                    continue
+            elif verbose and explicit:
+                phenny.say("error: dunno how to find the pdf on " + line)
+                phenny.say(download_url(line))
+                continue
+        elif verbose and explicit:
+            if response.status_code == 501:
+                if verbose:
+                    phenny.say("no translator available, raw dump: " + download_url(line))
+                    continue
+            else:
+                if verbose:
+                    phenny.say("error: HTTP " + str(response.status_code) + " " + download_url(line))
+                    continue
+        else:
+            continue
+    return
 download.commands = ["fetch", "get", "download"]
 download.priority = "high"
 download.rule = r'(.*)'
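
For reference, a minimal sketch (not part of the commit) of what the added re.findall loop does: it extracts every URL from a single IRC line so the bot can attempt one fetch per URL, and the fetch logic now uses continue instead of return so a failure on one URL does not abort the rest. The pattern is copied from the diff; the sample message is made up.

import re

# URL pattern added by this commit (written here as a raw string).
URL_PATTERN = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Hypothetical IRC line containing more than one URL.
message = "see http://example.com/a.pdf and also https://example.org/b?id=42"

# re.findall returns every non-overlapping match, so one line can yield
# several URLs; download() then runs its fetch logic once per match.
for url in re.findall(URL_PATTERN, message):
    print(url)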