possibly better sciencedirect handling

This commit is contained in:
Bryan Bishop 2013-01-23 20:01:03 -06:00
parent f7d7eaa6cb
commit 8b3abe9222

View File

@@ -173,12 +173,18 @@ def download_url(url):
title = citation_title title = citation_title
else: else:
if "sciencedirect.com" in url and not "ShoppingCart" in url: if "sciencedirect.com" in url and not "ShoppingCart" in url:
title = tree.xpath("//h1[@class='svTitle']")[0].text try:
pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0] title = tree.xpath("//h1[@class='svTitle']")[0].text
response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"}) pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
content = response.content new_response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"})
if "pdf" in response.headers["content-type"]: new_content = new_response.content
extension = ".pdf" if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
except Exception:
pass
else:
content = new_content
response = new_response
elif "h1 class=\"articleTitle" in content: elif "h1 class=\"articleTitle" in content:
try: try:
title = tree.xpath("//h1[@class='articleTitle']")[0].text title = tree.xpath("//h1[@class='articleTitle']")[0].text