Config file, SOCKS support, multiple servers

Antoine Amarilli 2013-05-11 16:10:48 +02:00
parent 86c2e11a8c
commit e19aa9e534
4 changed files with 216 additions and 118 deletions


@ -1,12 +1,16 @@
# paperbot
Paperbot is an IRC bot that fetches academic papers. It monitors all conversation for links to scholarly content, then fetches the content and posts a public link. This seems to help enhance the quality of discussion and make us less ignorant. When a link fails to lead to a pdf with the zotero translators, paperbot will not attempt further downloads of the paper unless paperbot was specifically spoken to.
Paperbot is a command-line utility that fetches academic papers. When given a URL on stdin or as a CLI argument, it fetches the content and returns a public link on stdout. This seems to help enhance the quality of discussion and make us less ignorant.
Paperbot can easily be turned back into an IRC bot with [irctk](http://gitorious.org/irctk).
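For instance, papers.py can be scripted from another Python program through this stdin/stdout interface; the sketch below is illustrative only (the interpreter, script path and paper URL are placeholders):

```python
# Minimal sketch of driving papers.py via its CLI-argument mode.
# "python", "papers.py" and the URL are placeholders for a real setup.
import subprocess

proc = subprocess.Popen(["python", "papers.py", "http://example.com/some-paper"],
                        stdout=subprocess.PIPE)
link, _ = proc.communicate()
print(link)  # papers.py prints the public link (or an error message) on stdout
```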
<div id="details" />
<div id="deets" />
## deets
All content is scraped using [zotero/translators](https://github.com/zotero/translators). These are javascript scrapers that work on a large number of academic publisher sites and are actively maintained. Paperbot offloads links to [zotero/translation-server](https://github.com/zotero/translation-server), which runs the zotero scrapers headlessly in a gecko and xulrunner environment. The scrapers return metadata and a link to the pdf. Then paperbot fetches that particular pdf. Sometimes in IRC someone drops a link straight to a pdf, which paperbot is also happy to compulsively archive.
All content is scraped using [zotero/translators](https://github.com/zotero/translators). These are javascript scrapers that work on a large number of academic publisher sites and are actively maintained. Paperbot offloads links to [zotero/translation-server](https://github.com/zotero/translation-server), which runs the zotero scrapers headlessly in a gecko and xulrunner environment. The scrapers return metadata and a link to the pdf. Then paperbot fetches that particular pdf. When given a link straight to a pdf, paperbot is also happy to compulsively archive it.
Paperbot can try multiple instances of translation-server (configured to use different ways to access content) and different SOCKS proxies to retrieve the content.
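One round trip looks roughly like the sketch below (illustrative only: the endpoint, proxy address, paper URL and sessionid are placeholders, and the real papers.py adds error handling and falls back across several configured servers):

```python
# Sketch of the flow: ask a translation-server instance for metadata and a PDF
# link, then fetch the PDF through a SOCKS proxy. All addresses are placeholders.
import json
import requesocks as requests

zotero = "http://localhost:1969/web"    # a translation-server instance
proxy = "socks4://127.0.0.1:9999"       # a SOCKS proxy for the PDF download

# run the zotero scrapers on the page
reply = requests.post(zotero,
                      data=json.dumps({"url": "http://example.com/some-paper",
                                       "sessionid": "abc123"}),
                      headers={"Content-Type": "application/json"})
item = json.loads(reply.content)[0]

# pick the first attachment that looks like a PDF
pdf_url = next(a["url"] for a in item.get("attachments", [])
               if "application/pdf" in a.get("mimeType", ""))

# fetch the PDF itself through the SOCKS proxy
session = requests.Session()
session.proxies = {"http": proxy, "https": proxy}
pdf = session.get(pdf_url).content
```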
* [zotero/translators](https://github.com/zotero/translators)
* [zotero/translation-server](https://github.com/zotero/translation-server)
@ -14,19 +18,9 @@ All content is scraped using [zotero/translators](https://github.com/zotero/tran
* [phenny](https://github.com/sbp/phenny)
* [pdfparanoia](https://github.com/kanzure/pdfparanoia)
<div id="todo" />
## TODO
It would be nice to use multiple proxies to resolve a pdf request.
<div id="demo" />
<div id="channel" />
## active demo
say hi to paperbot on irc.freenode.net ##hplusroadmap
<div id="license" />
## license
BSD.
Original project is: https://github.com/kanzure/paperbot

papers.py (184 changes)

@ -1,4 +1,5 @@
#!/usr/bin/python
#!/usr/bin/python -u
# coding=utf8
"""
Fetches papers.
"""
@ -7,29 +8,17 @@ import os
import json
import params
import random
import requests
import requesocks as requests
import lxml.etree
import sys
from time import time
from StringIO import StringIO
import pdfparanoia
def download(line, verbose=True):
"""
Downloads a paper.
"""
# don't bother if there's nothing there
if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
return
for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
line = filter_fix(line)
# fix for login.jsp links to ieee xplore
line = fix_ieee_login_urls(line)
line = fix_jstor_pdf_urls(line)
translation_url = params.server
def download_proxy(line, zotero, proxy, verbose=True):
sys.stderr.write("attempting download of %s through %s and %s\n" %
(line, zotero, proxy))
headers = {
"Content-Type": "application/json",
@ -42,41 +31,67 @@ def download(line, verbose=True):
data = json.dumps(data)
response = requests.post(translation_url, data=data, headers=headers)
response = requests.post(zotero, data=data, headers=headers)
if response.status_code == 200 and response.content != "[]":
if response.status_code != 200 or response.content == "[]":
sys.stderr.write("no valid reply from zotero\n")
sys.stderr.write("status %d\n" % response.status_code)
sys.stderr.write("content %s\n" % response.content)
return -1 # fatal
sys.stderr.write("content %s\n" % response.content)
# see if there are any attachments
content = json.loads(response.content)
item = content[0]
title = item["title"]
if item.has_key("attachments"):
if not item.has_key("attachments"):
sys.stderr.write("no attachement with this proxy\n")
return 1 # try another proxy
pdf_url = None
for attachment in item["attachments"]:
if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
pdf_url = attachment["url"]
break
if pdf_url:
if not pdf_url:
sys.stderr.write("no PDF attachement with this proxy\n")
return 1 # try another proxy
user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
headers = {
"User-Agent": user_agent,
}
sys.stderr.write("try retrieving " +
str(pdf_url) + " through proxy " + proxy + "\n")
response = None
session = requests.Session()
session.proxies = {
'http': proxy,
'https': proxy}
try:
if pdf_url.startswith("https://"):
response = requests.get(pdf_url, headers=headers, verify=False)
response = session.get(pdf_url, headers=headers, verify=False)
else:
response = requests.get(pdf_url, headers=headers)
response = session.get(pdf_url, headers=headers)
except requests.exceptions.ConnectionError:
sys.stderr.write("network failure on download " +
str(pdf_url) + "\n")
return 1
# detect failure
if response.status_code == 401:
print("HTTP 401 unauthorized " + str(pdf_url))
continue
sys.stderr.write("HTTP 401 unauthorized when trying to fetch " +
str(pdf_url) + "\n")
return 1
elif response.status_code != 200:
print("HTTP " + str(response.status_code) + " " + str(pdf_url))
continue
sys.stderr.write("HTTP " + str(response.status_code)
+ " when trying to fetch " + str(pdf_url) + "\n")
return 1
data = response.content
@ -89,6 +104,8 @@ def download(line, verbose=True):
# grr..
title = title.encode("ascii", "ignore")
title = title.replace(" ", "_")
title = title[:params.maxlen]
path = os.path.join(params.folder, title + ".pdf")
@ -106,21 +123,47 @@ def download(line, verbose=True):
url = params.url + filename + ".pdf"
print(url)
continue
else:
print(download_url(line))
continue
else:
print(download_url(line))
continue
else:
if response.status_code == 501:
if verbose:
print("no translator available, raw dump: " + download_url(line))
else:
if verbose:
print("error: HTTP " + str(response.status_code) + " " + download_url(line))
return 0
def download(line, verbose=True):
"""
Downloads a paper.
"""
# don't bother if there's nothing there
if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
return
for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
line = filter_fix(line)
# fix for login.jsp links to ieee xplore
line = fix_ieee_login_urls(line)
line = fix_jstor_pdf_urls(line)
ok = False
for (zotero, proxy) in params.servers:
s = download_proxy(line, zotero, proxy, verbose)
if s < 0:
break
if s == 0:
ok = True
break
if not ok:
for (zotero, proxy) in params.servers:
s = download_url(line, proxy)
sys.stderr.write("return code " + str(s) + "\n")
if s == 0:
ok = True
break
if not ok:
s = download_url(line, params.servers[0][1], last_resort=True)
if s != 0:
print "couldn't get it at all :("
return
download.commands = ["fetch", "get", "download"]
download.priority = "high"
download.rule = r'(.*)'
@ -135,8 +178,22 @@ def download_ieee(url):
# url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
raise NotImplementedError
def download_url(url):
response = requests.get(url, headers={"User-Agent": "origami-pdf"})
def download_url(url, proxy, last_resort=False):
sys.stderr.write("attempting direct for %s through %s\n" % (url,
proxy))
session = requests.Session()
session.proxies = {
'http': proxy,
'https': proxy}
try:
response = session.get(url, headers={"User-Agent": "origami-pdf"})
except requests.exceptions.ConnectionError:
sys.stderr.write("network failure on download " +
str(url) + "\n")
return 1
content = response.content
# just make up a default filename
@ -160,14 +217,14 @@ def download_url(url):
citation_pdf_url = None
if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
content = requests.get(citation_pdf_url).content
content = session.get(citation_pdf_url).content
tree = parse_html(content)
# citation_title = ...
# wow, this seriously needs to be cleaned up
if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
citation_title = citation_title.encode("ascii", "ignore")
response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
response = session.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
content = response.content
if "pdf" in response.headers["content-type"]:
extension = ".pdf"
@ -177,7 +234,7 @@ def download_url(url):
try:
title = tree.xpath("//h1[@class='svTitle']")[0].text
pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
new_response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"})
new_response = session.get(pdf_url, headers={"User-Agent": "sdf-macross"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
@ -212,7 +269,7 @@ def download_url(url):
if document_id.isdigit():
try:
pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
@ -225,7 +282,7 @@ def download_url(url):
try:
title = tree.xpath("//title/text()")[0].split(" | ")[0]
pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
@ -237,7 +294,7 @@ def download_url(url):
elif "ieeexplore.ieee.org" in url:
try:
pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
new_content = new_response.content
if "pdf" in new_response.headers["content-type"]:
extension = ".pdf"
@ -257,7 +314,7 @@ def download_url(url):
if pdf_url.startswith("/"):
url_start = url[:url.find("/",8)]
pdf_url = url_start + pdf_url
response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
response = session.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
content = response.content
if "pdf" in response.headers["content-type"]:
extension = ".pdf"
@ -272,12 +329,14 @@ def download_url(url):
# can't create directories
title = title.replace("/", "_")
title = title.replace(" ", "_")
title = title[:20]
title = title[:params.maxlen]
path = os.path.join(params.folder, title + extension)
if extension in [".pdf", "pdf"]:
try:
sys.stderr.write("got it! " +
str(url) + "\n")
content = pdfparanoia.scrub(StringIO(content))
except:
# this is to avoid a PDFNotImplementedError
@ -290,7 +349,17 @@ def download_url(url):
title = title.encode("ascii", "ignore")
url = params.url + requests.utils.quote(title) + extension
return url
if extension in [".pdf", "pdf"]:
print url
return 0
else:
sys.stderr.write("couldn't find it, dump: %s\n" % url)
if last_resort:
print "couldn't find it, dump: %s" % url
else:
return 1
return 0
def parse_html(content):
if not isinstance(content, StringIO):
@ -377,10 +446,19 @@ if __name__ == '__main__':
for a in sys.argv[1:]:
download(a)
else:
reqs = []
while True:
l = sys.stdin.readline()
if not l:
break
if l.startswith("help") or l.startswith("HELP"):
print params.help
reqs.append(time())
if len(reqs) > params.thresh:
delay = time() - reqs[len(reqs) - params.thresh + 1]
if params.limit - delay > 0:
print "rate limit exceeded, try again in %d second(s)" % (params.limit - delay)
else:
download(l)

params.py.example (new file, 26 changes)

@ -0,0 +1,26 @@
# the folder in which the papers should be stored
folder = "/home/user/www/papers/"
# the URL at which the papers will be served by an existing HTTP server
url = "http://example.com/papers/"
# the length at which the file names should be truncated
maxlen = 42
# the maximum number of requests to allow within the time window below before throttling
thresh = 10
# the length of the rate-limiting window, in seconds
limit = 60
# help message
help = "a nice help message"
# the various servers available to retrieve the content
# the first tuple item is the URL for the translation-server instance
# the second tuple item is the URL of the socks proxy to use
servers = [
("http://localhost:1969/web", 'socks4://127.0.0.1:9999'),
]
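For example, a hypothetical two-server setup could look as follows (all addresses are placeholders, and the socks5:// scheme assumes requesocks accepts it, e.g. for Tor's default SOCKS port); papers.py tries each pair in order and falls back to the next one when a download fails:

```python
# hypothetical multi-server setup; every address below is a placeholder
servers = [
    ("http://localhost:1969/web", 'socks4://127.0.0.1:9999'),
    ("http://localhost:1970/web", 'socks5://127.0.0.1:9050'),
]
```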


@ -1,3 +1,3 @@
phenny
requests
requesocks
-e git://github.com/kanzure/pdfparanoia.git@master#egg=pdfparanoia
copy params.py.example as params.py and customize it