attempt at multi URL downloads

Nathan McCorkle 2013-01-10 21:08:36 -08:00
parent f186b7d009
commit e4074d2b3d
1 changed file with 71 additions and 70 deletions

@@ -1,7 +1,7 @@
 """
 Fetches papers.
 """
+import re
 import os
 import json
 import random
@@ -39,91 +39,92 @@ def download(phenny, input, verbose=True):
     # don't bother if there's nothing there
     if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
         return
-    translation_url = "http://localhost:1969/web"
-    headers = {
-        "Content-Type": "application/json",
-    }
-    data = {
-        "url": line,
-        "sessionid": "what"
-    }
-    data = json.dumps(data)
-    response = requests.post(translation_url, data=data, headers=headers)
-    if response.status_code == 200:
-        # see if there are any attachments
-        content = json.loads(response.content)
-        item = content[0]
-        title = item["title"]
-        if item.has_key("attachments"):
-            pdf_url = None
-            for attachment in item["attachments"]:
-                if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
-                    pdf_url = attachment["url"]
-                    break
-            if pdf_url:
-                user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
-                headers = {
-                    "User-Agent": user_agent,
-                }
-                response = None
-                if pdf_url.startswith("https://"):
-                    response = requests.get(pdf_url, headers=headers, verify=False)
-                else:
-                    response = requests.get(pdf_url, headers=headers)
-                # detect failure
-                if response.status_code == 401:
-                    phenny.say("HTTP 401 unauthorized " + str(pdf_url))
-                    return
-                elif response.status_code != 200:
-                    phenny.say("HTTP " + str(response.status_code) + " " + str(pdf_url))
-                    return
-                data = response.content
-                # grr..
-                title = title.encode("ascii", "ignore")
-                path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")
-                file_handler = open(path, "w")
-                file_handler.write(data)
-                file_handler.close()
-                filename = requests.utils.quote(title)
-                url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"
-                phenny.say(url)
-                return
-            elif verbose and explicit:
-                phenny.say("error: didn't find any pdfs on " + line)
-                phenny.say(download_url(line))
-                return
-        elif verbose and explicit:
-            phenny.say("error: dunno how to find the pdf on " + line)
-            phenny.say(download_url(line))
-            return
-    elif verbose and explicit:
-        if response.status_code == 501:
-            if verbose:
-                phenny.say("no translator available, raw dump: " + download_url(line))
-                return
-        else:
-            if verbose:
-                phenny.say("error: HTTP " + str(response.status_code) + " " + download_url(line))
-                return
-    else:
-        return
+    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
+        translation_url = "http://localhost:1969/web"
+        headers = {
+            "Content-Type": "application/json",
+        }
+        data = {
+            "url": line,
+            "sessionid": "what"
+        }
+        data = json.dumps(data)
+        response = requests.post(translation_url, data=data, headers=headers)
+        if response.status_code == 200:
+            # see if there are any attachments
+            content = json.loads(response.content)
+            item = content[0]
+            title = item["title"]
+            if item.has_key("attachments"):
+                pdf_url = None
+                for attachment in item["attachments"]:
+                    if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
+                        pdf_url = attachment["url"]
+                        break
+                if pdf_url:
+                    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
+                    headers = {
+                        "User-Agent": user_agent,
+                    }
+                    response = None
+                    if pdf_url.startswith("https://"):
+                        response = requests.get(pdf_url, headers=headers, verify=False)
+                    else:
+                        response = requests.get(pdf_url, headers=headers)
+                    # detect failure
+                    if response.status_code == 401:
+                        phenny.say("HTTP 401 unauthorized " + str(pdf_url))
+                        continue
+                    elif response.status_code != 200:
+                        phenny.say("HTTP " + str(response.status_code) + " " + str(pdf_url))
+                        continue
+                    data = response.content
+                    # grr..
+                    title = title.encode("ascii", "ignore")
+                    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")
+                    file_handler = open(path, "w")
+                    file_handler.write(data)
+                    file_handler.close()
+                    filename = requests.utils.quote(title)
+                    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"
+                    phenny.say(url)
+                    continue
+                elif verbose and explicit:
+                    phenny.say("error: didn't find any pdfs on " + line)
+                    phenny.say(download_url(line))
+                    continue
+            elif verbose and explicit:
+                phenny.say("error: dunno how to find the pdf on " + line)
+                phenny.say(download_url(line))
+                continue
+        elif verbose and explicit:
+            if response.status_code == 501:
+                if verbose:
+                    phenny.say("no translator available, raw dump: " + download_url(line))
+                    continue
+            else:
+                if verbose:
+                    phenny.say("error: HTTP " + str(response.status_code) + " " + download_url(line))
+                    continue
+        else:
+            continue
+    return
 download.commands = ["fetch", "get", "download"]
 download.priority = "high"
 download.rule = r'(.*)'
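
For reference, a minimal sketch (not part of the commit) of what the added re.findall loop does: it extracts every URL from a single IRC line so the bot can attempt one fetch per URL, and the fetch logic now uses continue instead of return so a failure on one URL does not abort the rest. The pattern is copied from the diff; the sample message is made up.

import re

# URL pattern added by this commit (written here as a raw string).
URL_PATTERN = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Hypothetical IRC line containing more than one URL.
message = "see http://example.com/a.pdf and also https://example.org/b?id=42"

# re.findall returns every non-overlapping match, so one line can yield
# several URLs; download() then runs its fetch logic once per match.
for url in re.findall(URL_PATTERN, message):
    print(url)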