#!/usr/bin/python
"""
Fetches papers.
"""

import re
import os
import json
import params
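# params is expected to be a local config module providing at least
# params.server (translation-server endpoint), params.folder (download
# directory) and params.url (public base URL), as used below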
import random
import requests
import lxml.etree
import sys
from StringIO import StringIO

import pdfparanoia

def download(line, verbose=True):
    """
    Downloads a paper.
    """

    # don't bother if there's nothing there
    if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
        return

    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
        line = filter_fix(line)

        # fix for login.jsp links to ieee xplore
        line = fix_ieee_login_urls(line)
        line = fix_jstor_pdf_urls(line)

        translation_url = params.server

        headers = {
            "Content-Type": "application/json",
        }

        data = {
            "url": line,
            "sessionid": "what"
        }

        data = json.dumps(data)

        response = requests.post(translation_url, data=data, headers=headers)
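
        # the translation server is expected to return a JSON list of
        # Zotero-style items; each item may include an "attachments" list whose
        # entries carry "mimeType" and "url" fields (these are the fields the
        # parsing code below relies on)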
        if response.status_code == 200 and response.content != "[]":
            # see if there are any attachments
            content = json.loads(response.content)
            item = content[0]
            title = item["title"]

            if item.has_key("attachments"):
                pdf_url = None
                for attachment in item["attachments"]:
                    if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
                        pdf_url = attachment["url"]
                        break

                if pdf_url:
                    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
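                    # a desktop browser User-Agent is presumably spoofed here
                    # because some publisher sites refuse requests' default one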

                    headers = {
                        "User-Agent": user_agent,
                    }

                    response = None
                    if pdf_url.startswith("https://"):
                        response = requests.get(pdf_url, headers=headers, verify=False)
                    else:
                        response = requests.get(pdf_url, headers=headers)

                    # detect failure
                    if response.status_code == 401:
                        print("HTTP 401 unauthorized " + str(pdf_url))
                        continue
                    elif response.status_code != 200:
                        print("HTTP " + str(response.status_code) + " " + str(pdf_url))
                        continue

                    data = response.content

                    if "pdf" in response.headers["content-type"]:
                        try:
                            data = pdfparanoia.scrub(StringIO(data))
                        except:
                            # this is to avoid a PDFNotImplementedError
                            pass

                    # grr..
                    title = title.encode("ascii", "ignore")

                    path = os.path.join(params.folder, title + ".pdf")

                    file_handler = open(path, "w")
                    file_handler.write(data)
                    file_handler.close()

                    filename = requests.utils.quote(title)

                    # Remove an ending period, which sometimes happens when the
                    # title of the paper has a period at the end.
                    if filename[-1] == ".":
                        filename = filename[:-1]

                    url = params.url + filename + ".pdf"
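                    # params.url is presumably the public base URL under which
                    # params.folder is served, so this link should be shareable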

                    print(url)
                    continue
                else:
                    print(download_url(line))
                    continue
            else:
                print(download_url(line))
                continue
        else:
            if response.status_code == 501:
                if verbose:
                    print("no translator available, raw dump: " + download_url(line))
            else:
                if verbose:
                    print("error: HTTP " + str(response.status_code) + " " + download_url(line))
    return

download.commands = ["fetch", "get", "download"]
download.priority = "high"
download.rule = r'(.*)'
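# the three attributes above look like the phenny/jenni IRC-bot module
# convention (command names, priority, trigger regex), so this script
# presumably also runs as a bot plugin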


def download_ieee(url):
    """
    Downloads an IEEE paper. The Zotero translator requires frames/windows to
    be available. Eventually translation-server will be fixed, but until then
    it might be nice to have an IEEE workaround.
    """
    # url = "http://ieeexplore.ieee.org:80/xpl/freeabs_all.jsp?reload=true&arnumber=901261"
    # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
    raise NotImplementedError
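    # note: ieeexplore links that fall through to download_url() are already
    # handled there by scraping the PDF frame out of the article page, so this
    # stub is only a placeholder for a cleaner approach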


def download_url(url):
    response = requests.get(url, headers={"User-Agent": "origami-pdf"})
    content = response.content

    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)

    # default extension
    extension = ".txt"

    if "pdf" in response.headers["content-type"]:
        extension = ".pdf"
    elif check_if_html(response):
        # parse the html string with lxml.etree
        tree = parse_html(content)

        # extract some metadata with xpaths
        citation_pdf_url = find_citation_pdf_url(tree, url)
        citation_title = find_citation_title(tree)
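
        # citation_pdf_url and citation_title come from the Highwire Press-style
        # <meta> tags that Google Scholar asks publishers to emit; when present,
        # citation_pdf_url normally points straight at the full-text PDF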

        # aip.org sucks, citation_pdf_url is wrong
        if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
            citation_pdf_url = None

        if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
            content = requests.get(citation_pdf_url).content
            tree = parse_html(content)
            # citation_title = ...

        # wow, this seriously needs to be cleaned up
        if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
            citation_title = citation_title.encode("ascii", "ignore")
            response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
            content = response.content
            if "pdf" in response.headers["content-type"]:
                extension = ".pdf"
                title = citation_title
        else:
            if "sciencedirect.com" in url and not "ShoppingCart" in url:
                try:
                    title = tree.xpath("//h1[@class='svTitle']")[0].text
                    pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                    new_response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "jstor.org/" in url:
                # clean up the url
                if "?" in url:
                    url = url[0:url.find("?")]

                # not all pages have the <input type="hidden" name="ppv-title"> element
                try:
                    title = tree.xpath("//div[@class='hd title']")[0].text
                except Exception:
                    try:
                        title = tree.xpath("//input[@name='ppv-title']/@value")[0]
                    except Exception:
                        pass

                # get the document id
                document_id = None
                if url[-1] != "/":
                    #if "stable/" in url:
                    #elif "discover/" in url:
                    #elif "action/showShelf?candidate=" in url:
                    #elif "pss/" in url:
                    document_id = url.split("/")[-1]
                if document_id and document_id.isdigit():
                    try:
                        pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
                        new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
            elif ".aip.org/" in url:
                try:
                    title = tree.xpath("//title/text()")[0].split(" | ")[0]
                    pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
                    new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "ieeexplore.ieee.org" in url:
                try:
                    pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
                    new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "h1 class=\"articleTitle" in content:
                try:
                    title = tree.xpath("//h1[@class='articleTitle']")[0].text
                    title = title.encode("ascii", "ignore")
                    pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
                except:
                    pass
                else:
                    if pdf_url.startswith("/"):
                        url_start = url[:url.find("/",8)]
                        pdf_url = url_start + pdf_url
                    response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
                    content = response.content
                    if "pdf" in response.headers["content-type"]:
                        extension = ".pdf"
            # raise Exception("problem with citation_pdf_url or citation_title")
            # well, at least save the contents from the original url
            pass

    # make the title again just in case
    if not title:
        title = "%0.2x" % random.getrandbits(128)

    # can't create directories
    title = title.replace("/", "_")
    title = title.replace(" ", "_")
    title = title[:20]

    path = os.path.join(params.folder, title + extension)

    if extension in [".pdf", "pdf"]:
        try:
            content = pdfparanoia.scrub(StringIO(content))
        except:
            # this is to avoid a PDFNotImplementedError
            pass

    file_handler = open(path, "w")
    file_handler.write(content)
    file_handler.close()

    title = title.encode("ascii", "ignore")
    url = params.url + requests.utils.quote(title) + extension

    return url


def parse_html(content):
    if not isinstance(content, StringIO):
        content = StringIO(content)
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(content, parser)
    return tree


def check_if_html(response):
    return "text/html" in response.headers["content-type"]


def find_citation_pdf_url(tree, url):
    """
    Returns the <meta name="citation_pdf_url"> content attribute.
    """
    citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
    if citation_pdf_url and not citation_pdf_url.startswith("http"):
        if citation_pdf_url.startswith("/"):
            url_start = url[:url.find("/",8)]
            citation_pdf_url = url_start + citation_pdf_url
        else:
            raise Exception("unhandled situation (citation_pdf_url)")
    return citation_pdf_url


def find_citation_title(tree):
    """
    Returns the <meta name="citation_title"> content attribute.
    """
    citation_title = extract_meta_content(tree, "citation_title")
    return citation_title


def extract_meta_content(tree, meta_name):
    try:
        content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
    except:
        return None
    else:
        return content


def filter_fix(url):
    """
    Fixes some common problems in urls.
    """
    if ".proxy.lib.pdx.edu" in url:
        url = url.replace(".proxy.lib.pdx.edu", "")
    return url


def fix_ieee_login_urls(url):
    """
    Fixes urls pointing to login.jsp on IEEE Xplore. When someone browses to
    the abstracts page on IEEE Xplore, they are sometimes sent to the login.jsp
    page, and then this link is given to paperbot. The actual link is based on
    the arnumber.

    example:
    http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324
    """
    if "ieeexplore.ieee.org/xpl/login.jsp" in url:
        if "arnumber=" in url:
            parts = url.split("arnumber=")

            # i guess the url might not look like the example in the docstring
            if "&" in parts[1]:
                arnumber = parts[1].split("&")[0]
            else:
                arnumber = parts[1]

            return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=" + arnumber

    # default case when things go wrong
    return url


def fix_jstor_pdf_urls(url):
    """
    Fixes urls pointing to jstor pdfs.
    """
    if "jstor.org/" in url:
        if ".pdf" in url and not "?acceptTC=true" in url:
            url += "?acceptTC=true"
    return url


if __name__ == '__main__':
    if len(sys.argv) > 1:
        for a in sys.argv[1:]:
            download(a)
    else:
        while True:
            l = sys.stdin.readline()
            if not l:
                break
            download(l)
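
# example usage (script name assumed; pass URLs as arguments or pipe them in):
#   python papers.py http://example.com/some-paper-page
#   echo "http://example.com/some-paper-page" | python papers.py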