attempt pdfs when zotero fails me
This commit is contained in:
parent
6dcb23e2f8
commit
2748d54bd4
@ -6,6 +6,8 @@ import os
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import requests
|
import requests
|
||||||
|
import lxml.etree
|
||||||
|
from StringIO import StringIO
|
||||||
|
|
||||||
def download(phenny, input, verbose=True):
|
def download(phenny, input, verbose=True):
|
||||||
"""
|
"""
|
||||||
@ -143,15 +145,76 @@ def download_url(url):
|
|||||||
response = requests.get(url, headers={"User-Agent": "origami-pdf"})
|
response = requests.get(url, headers={"User-Agent": "origami-pdf"})
|
||||||
content = response.content
|
content = response.content
|
||||||
|
|
||||||
|
# just make up a default filename
|
||||||
title = "%0.2x" % random.getrandbits(128)
|
title = "%0.2x" % random.getrandbits(128)
|
||||||
|
|
||||||
path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title)
|
# default extension
|
||||||
|
extension = ".txt"
|
||||||
|
|
||||||
|
if "pdf" in response.headers["content-type"]:
|
||||||
|
extension = ".pdf"
|
||||||
|
elif check_if_html(response):
|
||||||
|
# parse the html string with lxml.etree
|
||||||
|
tree = parse_html(content)
|
||||||
|
|
||||||
|
# extract some metadata with xpaths
|
||||||
|
citation_pdf_url = find_citation_pdf_url(tree, url)
|
||||||
|
citation_title = find_citation_title(tree)
|
||||||
|
|
||||||
|
if citation_pdf_url and content_title:
|
||||||
|
response = requests.get(citation_pdf_url, headers={"User-Agent": "gundam-gdf"})
|
||||||
|
content = response.content
|
||||||
|
if "pdf" in response.headers["content-type"]:
|
||||||
|
extension = ".pdf"
|
||||||
|
title = citation_title
|
||||||
|
else:
|
||||||
|
raise Exception("problem with citation_pdf_url or content_title")
|
||||||
|
|
||||||
|
path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)
|
||||||
|
|
||||||
file_handler = open(path, "w")
|
file_handler = open(path, "w")
|
||||||
file_handler.write(content)
|
file_handler.write(content)
|
||||||
file_handler.close()
|
file_handler.close()
|
||||||
|
|
||||||
url = "http://diyhpl.us/~bryan/papers2/paperbot/" + title
|
url = "http://diyhpl.us/~bryan/papers2/paperbot/" + requests.utils.quote(title) + extension
|
||||||
|
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
def parse_html(content):
|
||||||
|
if not isinstance(content, StringIO):
|
||||||
|
content = StringIO(content)
|
||||||
|
parser = lxml.etree.HTMLParser()
|
||||||
|
tree = lxml.etree.parse(content, parser)
|
||||||
|
return tree
|
||||||
|
|
||||||
|
def check_if_html(response):
|
||||||
|
return "text/html" in response.headers["content-type"]
|
||||||
|
|
||||||
|
def find_citation_pdf_url(tree, url):
|
||||||
|
"""
|
||||||
|
Returns the <meta name="citation_pdf_url"> content attribute.
|
||||||
|
"""
|
||||||
|
citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
|
||||||
|
if not citation_pdf_url.startswith("http"):
|
||||||
|
if citation_pdf_url.startswith("/"):
|
||||||
|
url_start = url[:url.find("/",8)]
|
||||||
|
citation_pdf_url = url_start + citation_pdf_url
|
||||||
|
else:
|
||||||
|
raise Exception("unhandled situation (citation_pdf_url)")
|
||||||
|
return citation_pdf_url
|
||||||
|
|
||||||
|
def find_citation_title(tree):
|
||||||
|
"""
|
||||||
|
Returns the <meta name="citation_title"> content attribute.
|
||||||
|
"""
|
||||||
|
citation_title = extract_meta_content(tree, "citation_title")
|
||||||
|
return citation_title
|
||||||
|
|
||||||
|
def extract_meta_content(tree, meta_name):
|
||||||
|
try:
|
||||||
|
content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
|
||||||
|
except:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return content
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user