#!/usr/bin/python -u
# coding=utf8
"""
Fetches papers.
"""

import re
import os
import json
import params
import random
import requesocks as requests
import lxml.etree
import sys
from time import time
from StringIO import StringIO

import pdfparanoia

def download_proxy(line, zotero, proxy, verbose=True):
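    """
    Asks the zotero translation-server at `zotero` to translate `line`, then
    fetches the first PDF attachment it reports through `proxy`.

    Returns 0 on success, 1 when this proxy should be skipped and the next
    one tried, and -1 on a fatal error (no usable reply from zotero).
    """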
sys.stderr.write("attempting download of %s through %s and %s\n" %
(line, zotero, proxy))
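    # the translation-server expects a JSON body containing the target url
    # and a session id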
headers = {
"Content-Type": "application/json",
}
data = {
"url": line,
"sessionid": "what"
}
data = json.dumps(data)
response = requests.post(zotero, data=data, headers=headers)
if response.status_code != 200 or response.content == "[]":
sys.stderr.write("no valid reply from zotero\n")
sys.stderr.write("status %d\n" % response.status_code)
sys.stderr.write("content %s\n" % response.content)
return -1 # fatal
sys.stderr.write("content %s\n" % response.content)
# see if there are any attachments
content = json.loads(response.content)
item = content[0]
title = item["title"]

    if "attachments" not in item:
        sys.stderr.write("no attachment with this proxy\n")
        return 1 # try another proxy

    pdf_url = None
    for attachment in item["attachments"]:
        if "mimeType" in attachment and "application/pdf" in attachment["mimeType"]:
            pdf_url = attachment["url"]
            break

    if not pdf_url:
        sys.stderr.write("no PDF attachment with this proxy\n")
        return 1 # try another proxy
user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
headers = {
"User-Agent": user_agent,
}
sys.stderr.write("try retrieving " +
str(pdf_url) + " through proxy " + proxy + "\n")
response = None
session = requests.Session()
session.proxies = {
'http': proxy,
'https': proxy}
try:
if pdf_url.startswith("https://"):
response = session.get(pdf_url, headers=headers, verify=False)
else:
response = session.get(pdf_url, headers=headers)
except requests.exceptions.ConnectionError:
sys.stderr.write("network failure on download " +
str(pdf_url) + "\n")
return 1
# detect failure
if response.status_code == 401:
sys.stderr.write("HTTP 401 unauthorized when trying to fetch " +
str(pdf_url) + "\n")
return 1
elif response.status_code != 200:
sys.stderr.write("HTTP " + str(response.status_code)
+ " when trying to fetch " + str(pdf_url) + "\n")
return 1
data = response.content
if "pdf" in response.headers["content-type"]:
try:
data = pdfparanoia.scrub(StringIO(data))
        except Exception:
            # this is to avoid a PDFNotImplementedError
            pass
# grr..
title = title.encode("ascii", "ignore")
title = title.replace(" ", "_")
title = title[:params.maxlen]
path = os.path.join(params.folder, title + ".pdf")

    file_handler = open(path, "wb")
    file_handler.write(data)
    file_handler.close()
filename = requests.utils.quote(title)
# Remove an ending period, which sometimes happens when the
# title of the paper has a period at the end.
if filename[-1] == ".":
filename = filename[:-1]
url = params.url + filename + ".pdf"
print(url)
return 0
def download(line, verbose=True):
"""
Downloads a paper.
"""
    # don't bother if there's nothing there
    if len(line) < 5 or ("http://" not in line and "https://" not in line) or not line.startswith("http"):
        return
for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
line = filter_fix(line)
# fix for login.jsp links to ieee xplore
line = fix_ieee_login_urls(line)
line = fix_jstor_pdf_urls(line)
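        # try each (zotero translation-server, proxy) pair first; if none of
        # them yield a PDF, fall back to a direct fetch through each proxy,
        # and finally to a last-resort direct fetch that at least dumps
        # whatever was retrieved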
ok = False
for (zotero, proxy) in params.servers:
s = download_proxy(line, zotero, proxy, verbose)
if s < 0:
break
if s == 0:
ok = True
break
if not ok:
for (zotero, proxy) in params.servers:
s = download_url(line, proxy)
sys.stderr.write("return code " + str(s) + "\n")
if s == 0:
ok = True
break
if not ok:
s = download_url(line, params.servers[0][1], last_resort=True)
if s != 0:
print "couldn't get it at all :("
return
download.commands = ["fetch", "get", "download"]
download.priority = "high"
download.rule = r'(.*)'
def download_ieee(url):
"""
Downloads an IEEE paper. The Zotero translator requires frames/windows to
be available. Eventually translation-server will be fixed, but until then
it might be nice to have an IEEE workaround.
"""
# url = "http://ieeexplore.ieee.org:80/xpl/freeabs_all.jsp?reload=true&arnumber=901261"
# url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
raise NotImplementedError
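
# NOTE: for now, the ieeexplore.ieee.org branch in download_url() below follows
# the PDF <frame> on the article page, which covers part of this workaround.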
def download_url(url, proxy, last_resort=False):
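    """
    Fetches `url` directly through `proxy`, scraping known publisher pages
    (ScienceDirect, JSTOR, AIP, IEEE Xplore, and pages with an articleTitle
    header) for a PDF link when the response is HTML. Prints the rehosted
    URL and returns 0 on success; returns 1 if no PDF was found, unless
    last_resort is set, in which case the dump URL is announced instead.
    """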
sys.stderr.write("attempting direct for %s through %s\n" % (url,
proxy))
session = requests.Session()
session.proxies = {
'http': proxy,
'https': proxy}
try:
response = session.get(url, headers={"User-Agent": "origami-pdf"})
except requests.exceptions.ConnectionError:
sys.stderr.write("network failure on download " +
str(url) + "\n")
return 1
content = response.content
# just make up a default filename
title = "%0.2x" % random.getrandbits(128)
# default extension
extension = ".txt"
if "pdf" in response.headers["content-type"]:
extension = ".pdf"
elif check_if_html(response):
# parse the html string with lxml.etree
tree = parse_html(content)
# extract some metadata with xpaths
citation_pdf_url = find_citation_pdf_url(tree, url)
citation_title = find_citation_title(tree)
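        # citation_pdf_url / citation_title come from the <meta name="citation_*">
        # tags that most publishers embed in their abstract pages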
# aip.org sucks, citation_pdf_url is wrong
if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
citation_pdf_url = None
if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
content = session.get(citation_pdf_url).content
tree = parse_html(content)
# citation_title = ...
# wow, this seriously needs to be cleaned up
if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
citation_title = citation_title.encode("ascii", "ignore")
response = session.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
content = response.content
if "pdf" in response.headers["content-type"]:
extension = ".pdf"
title = citation_title
else:
if "sciencedirect.com" in url and not "ShoppingCart" in url:
try:
title = tree.xpath("//h1[@class='svTitle']")[0].text
pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
new_response = session.get(pdf_url, headers={"User-Agent": "sdf-macross"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
except Exception:
pass
else:
content = new_content
response = new_response
elif "jstor.org/" in url:
# clean up the url
if "?" in url:
url = url[0:url.find("?")]
# not all pages have the <input type="hidden" name="ppv-title"> element
try:
title = tree.xpath("//div[@class='hd title']")[0].text
except Exception:
try:
title = tree.xpath("//input[@name='ppv-title']/@value")[0]
except Exception:
pass
# get the document id
document_id = None
if url[-1] != "/":
#if "stable/" in url:
#elif "discover/" in url:
#elif "action/showShelf?candidate=" in url:
#elif "pss/" in url:
document_id = url.split("/")[-1]
if document_id.isdigit():
try:
pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
except Exception:
pass
else:
content = new_content
response = new_response
elif ".aip.org/" in url:
try:
title = tree.xpath("//title/text()")[0].split(" | ")[0]
pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
except Exception:
pass
else:
content = new_content
response = new_response
elif "ieeexplore.ieee.org" in url:
try:
pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
except Exception:
pass
else:
content = new_content
response = new_response
elif "h1 class=\"articleTitle" in content:
try:
title = tree.xpath("//h1[@class='articleTitle']")[0].text
title = title.encode("ascii", "ignore")
pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
except:
pass
else:
if pdf_url.startswith("/"):
url_start = url[:url.find("/",8)]
pdf_url = url_start + pdf_url
response = session.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
content = response.content
if "pdf" in response.headers["content-type"]:
extension = ".pdf"
# raise Exception("problem with citation_pdf_url or citation_title")
# well, at least save the contents from the original url
pass
# make the title again just in case
if not title:
title = "%0.2x" % random.getrandbits(128)
# can't create directories
title = title.replace("/", "_")
title = title.replace(" ", "_")
title = title[:params.maxlen]
path = os.path.join(params.folder, title + extension)
if extension in [".pdf", "pdf"]:
try:
sys.stderr.write("got it! " +
str(url) + "\n")
content = pdfparanoia.scrub(StringIO(content))
except:
# this is to avoid a PDFNotImplementedError
pass
    file_handler = open(path, "wb")
    file_handler.write(content)
    file_handler.close()
title = title.encode("ascii", "ignore")
url = params.url + requests.utils.quote(title) + extension
if extension in [".pdf", "pdf"]:
print url
return 0
else:
sys.stderr.write("couldn't find it, dump: %s\n" % url)
if last_resort:
print "couldn't find it, dump: %s" % url
else:
return 1
return 0
def parse_html(content):
if not isinstance(content, StringIO):
content = StringIO(content)
parser = lxml.etree.HTMLParser()
tree = lxml.etree.parse(content, parser)
return tree
def check_if_html(response):
return "text/html" in response.headers["content-type"]
def find_citation_pdf_url(tree, url):
"""
Returns the <meta name="citation_pdf_url"> content attribute.
"""
citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
if citation_pdf_url and not citation_pdf_url.startswith("http"):
if citation_pdf_url.startswith("/"):
url_start = url[:url.find("/",8)]
citation_pdf_url = url_start + citation_pdf_url
else:
raise Exception("unhandled situation (citation_pdf_url)")
return citation_pdf_url
def find_citation_title(tree):
"""
Returns the <meta name="citation_title"> content attribute.
"""
citation_title = extract_meta_content(tree, "citation_title")
return citation_title
def extract_meta_content(tree, meta_name):
try:
content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
except:
return None
else:
return content
def filter_fix(url):
"""
Fixes some common problems in urls.
"""
if ".proxy.lib.pdx.edu" in url:
url = url.replace(".proxy.lib.pdx.edu", "")
return url
def fix_ieee_login_urls(url):
"""
Fixes urls point to login.jsp on IEEE Xplore. When someone browses to the
abstracts page on IEEE Xplore, they are sometimes sent to the login.jsp
page, and then this link is given to paperbot. The actual link is based on
the arnumber.
example:
http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324
"""
if "ieeexplore.ieee.org/xpl/login.jsp" in url:
if "arnumber=" in url:
parts = url.split("arnumber=")
# i guess the url might not look like the example in the docstring
if "&" in parts[1]:
arnumber = parts[1].split("&")[0]
else:
arnumber = parts[1]
return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=" + arnumber
# default case when things go wrong
return url
def fix_jstor_pdf_urls(url):
"""
Fixes urls pointing to jstor pdfs.
"""
if "jstor.org/" in url:
if ".pdf" in url and not "?acceptTC=true" in url:
url += "?acceptTC=true"
return url
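
# Usage: pass URLs as command-line arguments, or pipe one URL per line on
# stdin. In stdin mode, lines starting with "help" print params.help and
# requests are rate-limited via params.thresh / params.limit.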
if __name__ == '__main__':
if len(sys.argv) > 1:
for a in sys.argv[1:]:
download(a)
else:
reqs = []
while True:
l = sys.stdin.readline()
if not l:
break
if l.startswith("help") or l.startswith("HELP"):
print params.help
            reqs.append(time())
            # simple rate limit over the last params.thresh requests
            if len(reqs) > params.thresh:
                delay = time() - reqs[len(reqs) - params.thresh + 1]
                if params.limit - delay > 0:
                    print "rate limit exceeded, try again in %d second(s)" % (params.limit - delay)
                    continue
            download(l)