better support for aip.org
This commit is contained in:
parent
24fcf41760
commit
120f8fbfc0
@ -164,6 +164,7 @@ def download_url(url):
|
|||||||
citation_pdf_url = find_citation_pdf_url(tree, url)
|
citation_pdf_url = find_citation_pdf_url(tree, url)
|
||||||
citation_title = find_citation_title(tree)
|
citation_title = find_citation_title(tree)
|
||||||
|
|
||||||
|
# wow, this seriously needs to be cleaned up
|
||||||
if citation_pdf_url and citation_title:
|
if citation_pdf_url and citation_title:
|
||||||
citation_title = citation_title.encode("ascii", "ignore")
|
citation_title = citation_title.encode("ascii", "ignore")
|
||||||
response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
|
response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
|
||||||
@ -185,6 +186,19 @@ def download_url(url):
|
|||||||
else:
|
else:
|
||||||
content = new_content
|
content = new_content
|
||||||
response = new_response
|
response = new_response
|
||||||
|
elif "apl.aip.org" in url:
|
||||||
|
try:
|
||||||
|
title = tree.xpath("//title/text()")[0].split(" | ")[0]
|
||||||
|
pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
|
||||||
|
new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
|
||||||
|
new_content = new_response.content
|
||||||
|
if "pdf" in new_response.headers["content-type"]:
|
||||||
|
extension = ".pdf"
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
content = new_content
|
||||||
|
response = new_response
|
||||||
elif "h1 class=\"articleTitle" in content:
|
elif "h1 class=\"articleTitle" in content:
|
||||||
try:
|
try:
|
||||||
title = tree.xpath("//h1[@class='articleTitle']")[0].text
|
title = tree.xpath("//h1[@class='articleTitle']")[0].text
|
||||||
|
Loading…
Reference in New Issue
Block a user