From 191f00ad9f881c9a0adde6e8e48867df15281a30 Mon Sep 17 00:00:00 2001 From: Bryan Bishop Date: Wed, 16 Jan 2013 02:53:18 -0600 Subject: [PATCH] fix some bugs for pubs.acs.org --- modules/papers.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/modules/papers.py b/modules/papers.py index 0e66b2d..f3c62c4 100644 --- a/modules/papers.py +++ b/modules/papers.py @@ -167,7 +167,24 @@ def download_url(url): extension = ".pdf" title = citation_title else: - raise Exception("problem with citation_pdf_url or citation_title") + if "h1 class=\"articleTitle" in content: + try: + title = tree.xpath("//h1[class='articleTitle']")[0].text + title = title.encode("ascii", "ignore") + pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0] + except: + pass + else: + if pdf_url.startswith("/"): + url_start = url[:url.find("/",8)] + pdf_url = url_start + pdf_url + response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"}) + content = response.content + if "pdf" in response.headers["content-type"]: + extension = ".pdf" + # raise Exception("problem with citation_pdf_url or citation_title") + # well, at least save the contents from the original url + pass path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)