attempt pdfs when zotero fails me

2013-01-16 02:19:09 -06:00 · 2013-01-16 02:19:09 -06:00 · 2748d54bd4
commit 2748d54bd4
parent 6dcb23e2f8
1 changed files with 65 additions and 2 deletions
--- a/modules/papers.py
+++ b/modules/papers.py
@ -6,6 +6,8 @@ import os
 import json
 import random
 import requests
 import lxml.etree
 from StringIO import StringIO
 def download(phenny, input, verbose=True):
    """
@ -143,15 +145,76 @@ def download_url(url):
    response = requests.get(url, headers={"User-Agent": "origami-pdf"})
    content = response.content
    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)
-    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title)
+    # default extension
    extension = ".txt"
    if "pdf" in response.headers["content-type"]:
        extension = ".pdf"
    elif check_if_html(response):
        # parse the html string with lxml.etree
        tree = parse_html(content)
        # extract some metadata with xpaths
        citation_pdf_url = find_citation_pdf_url(tree, url)
        citation_title = find_citation_title(tree)
        if citation_pdf_url and content_title:
            response = requests.get(citation_pdf_url, headers={"User-Agent": "gundam-gdf"})
            content = response.content
            if "pdf" in response.headers["content-type"]:
                extension = ".pdf"
                title = citation_title
        else:
            raise Exception("problem with citation_pdf_url or content_title")
    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)
    file_handler = open(path, "w")
    file_handler.write(content)
    file_handler.close()
-    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + title
+    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + requests.utils.quote(title) + extension
    return url
 def parse_html(content):
    if not isinstance(content, StringIO):
        content = StringIO(content)
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(content, parser)
    return tree
 def check_if_html(response):
    return "text/html" in response.headers["content-type"]
 def find_citation_pdf_url(tree, url):
    """
    Returns the <meta name="citation_pdf_url"> content attribute.
    """
    citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
    if not citation_pdf_url.startswith("http"):
        if citation_pdf_url.startswith("/"):
            url_start = url[:url.find("/",8)]
            citation_pdf_url = url_start + citation_pdf_url
        else:
            raise Exception("unhandled situation (citation_pdf_url)")
    return citation_pdf_url
 def find_citation_title(tree):
    """
    Returns the <meta name="citation_title"> content attribute.
    """
    citation_title = extract_meta_content(tree, "citation_title")
    return citation_title
 def extract_meta_content(tree, meta_name):
    try:
        content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
    except:
        return None
    else:
        return content