attempt pdfs when zotero fails me

This commit is contained in:
Bryan Bishop 2013-01-16 02:19:09 -06:00
parent 6dcb23e2f8
commit 2748d54bd4
1 changed files with 65 additions and 2 deletions

View File

@ -6,6 +6,8 @@ import os
import json
import random
import requests
import lxml.etree
from StringIO import StringIO
def download(phenny, input, verbose=True):
"""
@ -143,15 +145,76 @@ def download_url(url):
response = requests.get(url, headers={"User-Agent": "origami-pdf"})
content = response.content
# just make up a default filename
title = "%0.2x" % random.getrandbits(128)
path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title)
# default extension
extension = ".txt"
if "pdf" in response.headers["content-type"]:
extension = ".pdf"
elif check_if_html(response):
# parse the html string with lxml.etree
tree = parse_html(content)
# extract some metadata with xpaths
citation_pdf_url = find_citation_pdf_url(tree, url)
citation_title = find_citation_title(tree)
if citation_pdf_url and content_title:
response = requests.get(citation_pdf_url, headers={"User-Agent": "gundam-gdf"})
content = response.content
if "pdf" in response.headers["content-type"]:
extension = ".pdf"
title = citation_title
else:
raise Exception("problem with citation_pdf_url or content_title")
path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)
file_handler = open(path, "w")
file_handler.write(content)
file_handler.close()
url = "http://diyhpl.us/~bryan/papers2/paperbot/" + title
url = "http://diyhpl.us/~bryan/papers2/paperbot/" + requests.utils.quote(title) + extension
return url
def parse_html(content):
if not isinstance(content, StringIO):
content = StringIO(content)
parser = lxml.etree.HTMLParser()
tree = lxml.etree.parse(content, parser)
return tree
def check_if_html(response):
return "text/html" in response.headers["content-type"]
def find_citation_pdf_url(tree, url):
"""
Returns the <meta name="citation_pdf_url"> content attribute.
"""
citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
if not citation_pdf_url.startswith("http"):
if citation_pdf_url.startswith("/"):
url_start = url[:url.find("/",8)]
citation_pdf_url = url_start + citation_pdf_url
else:
raise Exception("unhandled situation (citation_pdf_url)")
return citation_pdf_url
def find_citation_title(tree):
"""
Returns the <meta name="citation_title"> content attribute.
"""
citation_title = extract_meta_content(tree, "citation_title")
return citation_title
def extract_meta_content(tree, meta_name):
try:
content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
except:
return None
else:
return content