better support for aip.org

This commit is contained in:
Bryan Bishop 2013-02-06 19:57:42 -06:00
parent 24fcf41760
commit 120f8fbfc0

View File

@@ -164,6 +164,7 @@ def download_url(url):
citation_pdf_url = find_citation_pdf_url(tree, url) citation_pdf_url = find_citation_pdf_url(tree, url)
citation_title = find_citation_title(tree) citation_title = find_citation_title(tree)
# wow, this seriously needs to be cleaned up
if citation_pdf_url and citation_title: if citation_pdf_url and citation_title:
citation_title = citation_title.encode("ascii", "ignore") citation_title = citation_title.encode("ascii", "ignore")
response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"}) response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
@@ -185,6 +186,19 @@ def download_url(url):
else: else:
content = new_content content = new_content
response = new_response response = new_response
elif "apl.aip.org" in url:
try:
title = tree.xpath("//title/text()")[0].split(" | ")[0]
pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
except Exception:
pass
else:
content = new_content
response = new_response
elif "h1 class=\"articleTitle" in content: elif "h1 class=\"articleTitle" in content:
try: try:
title = tree.xpath("//h1[@class='articleTitle']")[0].text title = tree.xpath("//h1[@class='articleTitle']")[0].text