# Post-patch side of a change to modules/papers.py, restored to one statement
# per line.  Third-party modules (requests, lxml) are imported inside the
# functions that need them so the pure helpers stay importable without them.
import io
import json
import os
import random

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2


def download_url(url):
    """
    Fetch *url*, store the payload on disk and return its public URL.

    A PDF response is saved as-is.  An HTML response is scanned for the
    ``citation_pdf_url`` and ``citation_title`` meta tags; the linked document
    is then fetched and saved instead.  Any other content type is saved with a
    ``.txt`` extension under a random name.

    Raises ``Exception`` when an HTML page lacks either meta tag.
    """
    # NOTE(review): reconstructed from a patch hunk; statements between the
    # ``def`` line and the first request, if any, are not visible in the hunk.
    import requests

    response = requests.get(url, headers={"User-Agent": "origami-pdf"})
    content = response.content

    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)

    # default extension
    extension = ".txt"

    if "pdf" in response.headers.get("content-type", ""):
        extension = ".pdf"
    elif check_if_html(response):
        # parse the html string with lxml.etree
        tree = parse_html(content)

        # extract some metadata with xpaths
        citation_pdf_url = find_citation_pdf_url(tree, url)
        citation_title = find_citation_title(tree)

        # The original tested an undefined name (content_title) here, which
        # raised NameError for every HTML page.
        if not (citation_pdf_url and citation_title):
            raise Exception("problem with citation_pdf_url or content_title")

        response = requests.get(citation_pdf_url, headers={"User-Agent": "gundam-gdf"})
        content = response.content
        if "pdf" in response.headers.get("content-type", ""):
            extension = ".pdf"
            # NOTE(review): the collapsed patch does not show whether this
            # assignment was nested under the PDF check; nesting it keeps
            # non-PDF fallbacks under the random name -- confirm.
            title = _safe_filename(citation_title) or title

    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)

    # response.content is bytes, so write in binary mode; the context manager
    # closes the handle even if the write fails.
    with open(path, "wb") as file_handler:
        file_handler.write(content)

    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + requests.utils.quote(title) + extension

    return url


def _safe_filename(title):
    """Replace path separators and NULs so *title* is a single path component."""
    # The title comes from a remote page; a leading "/" would make
    # os.path.join discard the target directory altogether.
    for unsafe in ("/", "\\", "\0"):
        title = title.replace(unsafe, "_")
    return title.strip()


def parse_html(content):
    """
    Parse *content* with lxml's HTML parser and return the resulting tree.

    *content* may be a byte string, a text string, or an object with a
    ``read`` method (which is handed to lxml unchanged).
    """
    import lxml.etree

    if not hasattr(content, "read"):
        if isinstance(content, bytes):
            content = io.BytesIO(content)
        else:
            content = io.StringIO(content)
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(content, parser)
    return tree


def check_if_html(response):
    """Return True when the response's content-type header contains text/html."""
    # A missing header counts as "not HTML" rather than raising KeyError.
    return "text/html" in response.headers.get("content-type", "")


def find_citation_pdf_url(tree, url):
    """
    Return the ``citation_pdf_url`` meta content as an absolute URL.

    *url* is the address of the page *tree* was parsed from; it is the base
    against which a relative or scheme-relative value is resolved.  Returns
    None when the meta tag is missing or empty.
    """
    citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
    if not citation_pdf_url:
        return None
    # urljoin leaves absolute URLs untouched and also copes with a base URL
    # that has no path, which the previous string slicing truncated.
    return urljoin(url, citation_pdf_url)


def find_citation_title(tree):
    """
    Return the ``citation_title`` meta content, or None when it is missing.
    """
    citation_title = extract_meta_content(tree, "citation_title")
    return citation_title


def extract_meta_content(tree, meta_name):
    """
    Return the ``content`` attribute of the first ``<meta name=meta_name>``.

    Returns None when the xpath query yields no match.
    """
    matches = tree.xpath("//meta[@name='" + meta_name + "']/@content")
    if not matches:
        return None
    return matches[0]