attempt pdfs when zotero fails me

2013-01-16 02:19:09 -06:00 · 2013-01-16 02:19:09 -06:00 · 2748d54bd4
commit 2748d54bd4
parent 6dcb23e2f8
1 changed file with 65 additions and 2 deletions
--- a/modules/papers.py
+++ b/modules/papers.py
@ -6,6 +6,8 @@ import os
 import json
 import random
 import requests
+import lxml.etree
+from StringIO import StringIO

 def download(phenny, input, verbose=True):
    """
@ -143,15 +145,76 @@ def download_url(url):
    response = requests.get(url, headers={"User-Agent": "origami-pdf"})
    content = response.content

+    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)

-    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title)
+    # default extension
+    extension = ".txt"
+
+    if "pdf" in response.headers["content-type"]:
+        extension = ".pdf"
+    elif check_if_html(response):
+        # parse the html string with lxml.etree
+        tree = parse_html(content)
+
+        # extract some metadata with xpaths
+        citation_pdf_url = find_citation_pdf_url(tree, url)
+        citation_title = find_citation_title(tree)
+
+        if citation_pdf_url and content_title:
+            response = requests.get(citation_pdf_url, headers={"User-Agent": "gundam-gdf"})
+            content = response.content
+            if "pdf" in response.headers["content-type"]:
+                extension = ".pdf"
+                title = citation_title
+        else:
+            raise Exception("problem with citation_pdf_url or content_title")
+
+    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)

    file_handler = open(path, "w")
    file_handler.write(content)
    file_handler.close()

-    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + title
+    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + requests.utils.quote(title) + extension

    return url

+def parse_html(content):
+    """
+    Parses an html document with lxml's HTMLParser and returns the
+    resulting tree.
+
+    content is either a StringIO instance, which is used as-is, or a
+    value that StringIO() accepts, which gets wrapped first.
+    """
+    # hand lxml a file-like object rather than the raw value
+    source = content if isinstance(content, StringIO) else StringIO(content)
+    return lxml.etree.parse(source, lxml.etree.HTMLParser())
+
+def check_if_html(response):
+    """
+    Returns True when the response's content-type header mentions
+    text/html, False otherwise.
+
+    response is any object with a dict-like "headers" attribute. A
+    missing content-type header counts as "not html" instead of raising
+    KeyError, and the comparison ignores case.
+    """
+    content_type = response.headers.get("content-type") or ""
+    return "text/html" in content_type.lower()
+
+def find_citation_pdf_url(tree, url):
+    """
+    Returns the <meta name="citation_pdf_url"> content attribute as an
+    absolute url, or None when the page has no such meta tag.
+
+    tree is the parsed html document; url is the address the page was
+    fetched from, used to resolve host-relative ("/path") links.
+
+    Raises Exception for link forms that are neither absolute nor
+    host-relative.
+    """
+    citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
+
+    # extract_meta_content returns None when the tag is absent; pass that
+    # through instead of dying on None.startswith
+    if citation_pdf_url is None:
+        return None
+
+    if not citation_pdf_url.startswith("http"):
+        if citation_pdf_url.startswith("/"):
+            # scheme://host is everything before the first "/" found at
+            # or after offset 8, which is just past "https://"
+            host_end = url.find("/", 8)
+            # no path at all (find() == -1): url is already scheme://host,
+            # and slicing with -1 would chop off its last character
+            url_start = url if host_end == -1 else url[:host_end]
+            citation_pdf_url = url_start + citation_pdf_url
+        else:
+            raise Exception("unhandled situation (citation_pdf_url)")
+    return citation_pdf_url
+
+def find_citation_title(tree):
+    """
+    Looks up the <meta name="citation_title"> tag in the parsed document
+    and returns its content attribute (None if extract_meta_content
+    finds nothing).
+    """
+    return extract_meta_content(tree, "citation_title")
+
+def extract_meta_content(tree, meta_name):
+    """
+    Returns the content attribute of the first <meta> element in tree
+    whose name attribute equals meta_name, or None when there is no such
+    element.
+
+    tree must provide an xpath() method; errors raised by xpath() itself
+    are not swallowed.
+    """
+    matches = tree.xpath("//meta[@name='" + meta_name + "']/@content")
+    # An empty result means the tag is missing. Checking for that
+    # directly replaces a bare except that also hid unrelated errors
+    # (and KeyboardInterrupt).
+    if not matches:
+        return None
+    return matches[0]
+