fix sciencedirect.com parsing

This commit is contained in:
Bryan Bishop 2013-01-20 22:19:57 -06:00
parent cf7c1b78e1
commit a89129b424

View File

@@ -169,7 +169,10 @@ def download_url(url):
extension = ".pdf" extension = ".pdf"
title = citation_title title = citation_title
else: else:
if "h1 class=\"articleTitle" in content: if "sciencedirect.com" in url:
title = tree.xpath("//h1[@class='svTitle']")[0].text
pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
elif "h1 class=\"articleTitle" in content:
try: try:
title = tree.xpath("//h1[@class='articleTitle']")[0].text title = tree.xpath("//h1[@class='articleTitle']")[0].text
title = title.encode("ascii", "ignore") title = title.encode("ascii", "ignore")