Config file, SOCKS support, multiple servers

Antoine Amarilli 2013-05-11 16:10:48 +02:00
parent 86c2e11a8c
commit e19aa9e534
4 changed files with 216 additions and 118 deletions


@@ -1,12 +1,16 @@
 # paperbot
 
-Paperbot is an IRC bot that fetches academic papers. It monitors all conversation for links to scholarly content, then fetches the content and posts a public link. This seems to help enhance the quality of discussion and make us less ignorant. When a link fails to lead to a pdf with the zotero translators, paperbot will not attempt further downloads of the paper unless paperbot was specifically spoken to.
+Paperbot is a command-line utility that fetches academic papers. When given a URL on stdin or as a CLI argument, it fetches the content and returns a public link on stdout. This seems to help enhance the quality of discussion and make us less ignorant.
+
+Paperbot can easily be turned back into an IRC bot with [irctk](http://gitorious.org/irctk).
 
 <div id="details" />
 <div id="deets" />
 ## deets
 
-All content is scraped using [zotero/translators](https://github.com/zotero/translators). These are javascript scrapers that work on a large number of academic publisher sites and are actively maintained. Paperbot offloads links to [zotero/translation-server](https://github.com/zotero/translation-server), which runs the zotero scrapers headlessly in a gecko and xulrunner environment. The scrapers return metadata and a link to the pdf. Then paperbot fetches that particular pdf. Sometimes in IRC someone drops a link straight to a pdf, which paperbot is also happy to compulsively archive.
+All content is scraped using [zotero/translators](https://github.com/zotero/translators). These are javascript scrapers that work on a large number of academic publisher sites and are actively maintained. Paperbot offloads links to [zotero/translation-server](https://github.com/zotero/translation-server), which runs the zotero scrapers headlessly in a gecko and xulrunner environment. The scrapers return metadata and a link to the pdf. Then paperbot fetches that particular pdf. When given a link straight to a pdf, paperbot is also happy to compulsively archive it.
+
+Paperbot can try multiple instances of translation-server (configured to use different ways to access content) and different SOCKS proxies to retrieve the content.
 
 * [zotero/translators](https://github.com/zotero/translators)
 * [zotero/translation-server](https://github.com/zotero/translation-server)
@@ -14,19 +18,9 @@ All content is scraped using [zotero/translators](https://github.com/zotero/tran
 * [phenny](https://github.com/sbp/phenny)
 * [pdfparanoia](https://github.com/kanzure/pdfparanoia)
 
-<div id="todo" />
-## TODO
-
-It would be nice to use multiple proxies to resolve a pdf request.
-
-<div id="demo" />
-<div id="channel" />
-## active demo
-
-say hi to paperbot on irc.freenode.net ##hplusroadmap
-
 <div id="license" />
 ## license
 
 BSD.
+
+Original project is: https://github.com/kanzure/paperbot
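For context on the pipeline described above: paperbot's only interaction with translation-server is one HTTP POST per link. The sketch below is illustrative rather than code from the repository; the endpoint is the default from params.py.example, and the payload shape (a url plus a sessionid) is an assumption about the translation-server /web API of that era, with placeholder values throughout.

```python
import json
import requests  # plain requests is enough for this standalone illustration

# Hypothetical sketch of the request papers.py sends for each link.
ZOTERO = "http://localhost:1969/web"  # default from params.py.example

payload = json.dumps({"url": "http://example.com/some-paper",  # placeholder link
                      "sessionid": "abc123"})                  # arbitrary session id
response = requests.post(ZOTERO, data=payload,
                         headers={"Content-Type": "application/json"})

print(response.status_code)  # 200 with a non-empty JSON list means a scraper matched
print(response.content)      # item metadata, possibly with "attachments" holding a PDF URL
```

If the reply contains a PDF attachment, paperbot then fetches that URL itself, as the papers.py diff below shows.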

papers.py

@@ -1,4 +1,5 @@
-#!/usr/bin/python
+#!/usr/bin/python -u
+# coding=utf8
 """
 Fetches papers.
 """
@@ -7,29 +8,17 @@ import os
 import json
 import params
 import random
-import requests
+import requesocks as requests
 import lxml.etree
 import sys
+from time import time
 from StringIO import StringIO
 import pdfparanoia
 
-def download(line, verbose=True):
-    """
-    Downloads a paper.
-    """
-    # don't bother if there's nothing there
-    if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
-        return
-    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
-        line = filter_fix(line)
-        # fix for login.jsp links to ieee xplore
-        line = fix_ieee_login_urls(line)
-        line = fix_jstor_pdf_urls(line)
-        translation_url = params.server
+def download_proxy(line, zotero, proxy, verbose=True):
+    sys.stderr.write("attempting download of %s through %s and %s\n" %
+                     (line, zotero, proxy))
 
     headers = {
         "Content-Type": "application/json",
@@ -42,41 +31,67 @@ def download(line, verbose=True):
     data = json.dumps(data)
 
-    response = requests.post(translation_url, data=data, headers=headers)
-    if response.status_code == 200 and response.content != "[]":
+    response = requests.post(zotero, data=data, headers=headers)
+    if response.status_code != 200 or response.content == "[]":
+        sys.stderr.write("no valid reply from zotero\n")
+        sys.stderr.write("status %d\n" % response.status_code)
+        sys.stderr.write("content %s\n" % response.content)
+        return -1 # fatal
+    sys.stderr.write("content %s\n" % response.content)
+
     # see if there are any attachments
     content = json.loads(response.content)
     item = content[0]
     title = item["title"]
 
-    if item.has_key("attachments"):
+    if not item.has_key("attachments"):
+        sys.stderr.write("no attachment with this proxy\n")
+        return 1 # try another proxy
+
     pdf_url = None
     for attachment in item["attachments"]:
         if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
             pdf_url = attachment["url"]
             break
 
-    if pdf_url:
+    if not pdf_url:
+        sys.stderr.write("no PDF attachment with this proxy\n")
+        return 1 # try another proxy
+
     user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
     headers = {
         "User-Agent": user_agent,
    }
+    sys.stderr.write("try retrieving " +
+                     str(pdf_url) + " through proxy " + proxy + "\n")
 
     response = None
-    if pdf_url.startswith("https://"):
-        response = requests.get(pdf_url, headers=headers, verify=False)
-    else:
-        response = requests.get(pdf_url, headers=headers)
+    session = requests.Session()
+    session.proxies = {
+        'http': proxy,
+        'https': proxy}
+    try:
+        if pdf_url.startswith("https://"):
+            response = session.get(pdf_url, headers=headers, verify=False)
+        else:
+            response = session.get(pdf_url, headers=headers)
+    except requests.exceptions.ConnectionError:
+        sys.stderr.write("network failure on download " +
+                         str(pdf_url) + "\n")
+        return 1
 
     # detect failure
     if response.status_code == 401:
-        print("HTTP 401 unauthorized " + str(pdf_url))
-        continue
+        sys.stderr.write("HTTP 401 unauthorized when trying to fetch " +
+                         str(pdf_url) + "\n")
+        return 1
     elif response.status_code != 200:
-        print("HTTP " + str(response.status_code) + " " + str(pdf_url))
-        continue
+        sys.stderr.write("HTTP " + str(response.status_code)
+                         + " when trying to fetch " + str(pdf_url) + "\n")
+        return 1
 
     data = response.content
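This hunk is where the SOCKS support lands: the `requesocks` fork keeps the familiar `requests`-style session API but understands SOCKS proxy URLs set on `session.proxies`. Below is a standalone sketch mirroring that pattern; the proxy URL is the placeholder from params.py.example, the target URL is made up, and the session calls are copied from the hunk above rather than checked against requesocks documentation.

```python
import requesocks as requests

# Sketch of the proxied-download pattern introduced above; the proxy URL is
# the placeholder from params.py.example, not a real endpoint.
proxy = "socks4://127.0.0.1:9999"

session = requests.Session()
session.proxies = {"http": proxy, "https": proxy}

try:
    # verify=False mirrors papers.py, which skips TLS verification on https PDFs
    response = session.get("https://example.com/paper.pdf",
                           headers={"User-Agent": "Mozilla/5.0"},
                           verify=False)
except requests.exceptions.ConnectionError:
    response = None  # papers.py treats this as "try the next (zotero, proxy) pair"
```

Plain `requests` of that era had no SOCKS support, which is presumably why the import at the top of papers.py is swapped for `requesocks`.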
@@ -89,6 +104,8 @@ def download(line, verbose=True):
     # grr..
     title = title.encode("ascii", "ignore")
+    title = title.replace(" ", "_")
+    title = title[:params.maxlen]
 
     path = os.path.join(params.folder, title + ".pdf")
@@ -106,21 +123,47 @@ def download(line, verbose=True):
     url = params.url + filename + ".pdf"
     print(url)
-                    continue
-                else:
-                    print(download_url(line))
-                    continue
-            else:
-                print(download_url(line))
-                continue
-        else:
-            if response.status_code == 501:
-                if verbose:
-                    print("no translator available, raw dump: " + download_url(line))
-            else:
-                if verbose:
-                    print("error: HTTP " + str(response.status_code) + " " + download_url(line))
-    return
+    return 0
+
+def download(line, verbose=True):
+    """
+    Downloads a paper.
+    """
+    # don't bother if there's nothing there
+    if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
+        return
+
+    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
+        line = filter_fix(line)
+        # fix for login.jsp links to ieee xplore
+        line = fix_ieee_login_urls(line)
+        line = fix_jstor_pdf_urls(line)
+
+        ok = False
+        for (zotero, proxy) in params.servers:
+            s = download_proxy(line, zotero, proxy, verbose)
+            if s < 0:
+                break
+            if s == 0:
+                ok = True
+                break
+        if not ok:
+            for (zotero, proxy) in params.servers:
+                s = download_url(line, proxy)
+                sys.stderr.write("return code " + str(s) + "\n")
+                if s == 0:
+                    ok = True
+                    break
+        if not ok:
+            s = download_url(line, params.servers[0][1], last_resort=True)
+            if s != 0:
+                print "couldn't get it at all :("
+    return
 
 download.commands = ["fetch", "get", "download"]
 download.priority = "high"
 download.rule = r'(.*)'
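The new `download()` wrapper encodes a small return-code protocol: `download_proxy()` returns 0 on success, 1 for "try the next (zotero, proxy) pair", and -1 for a fatal error such as an unusable reply from translation-server; only then is `download_url()` tried through each proxy, and finally once more as a last resort. A condensed sketch of that control flow, using a hypothetical helper name that is not part of the repository:

```python
# Hypothetical condensation of the fallback logic in download() above.
# download_proxy() and download_url() are the functions from papers.py;
# fetch_with_fallback() itself is illustrative only.
def fetch_with_fallback(url, servers):
    for zotero, proxy in servers:
        status = download_proxy(url, zotero, proxy)
        if status == 0:
            return True       # PDF fetched and published
        if status < 0:
            break             # fatal: translation-server gave nothing usable
    for _, proxy in servers:
        if download_url(url, proxy) == 0:
            return True       # direct scraping worked through this proxy
    # last resort: dump whatever can be retrieved through the first proxy
    return download_url(url, servers[0][1], last_resort=True) == 0
```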
@@ -135,8 +178,22 @@ def download_ieee(url):
     # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
     raise NotImplementedError
 
-def download_url(url):
-    response = requests.get(url, headers={"User-Agent": "origami-pdf"})
+def download_url(url, proxy, last_resort=False):
+    sys.stderr.write("attempting direct for %s through %s\n" % (url,
+                     proxy))
+
+    session = requests.Session()
+    session.proxies = {
+        'http': proxy,
+        'https': proxy}
+
+    try:
+        response = session.get(url, headers={"User-Agent": "origami-pdf"})
+    except requests.exceptions.ConnectionError:
+        sys.stderr.write("network failure on download " +
+                         str(url) + "\n")
+        return 1
 
     content = response.content
 
     # just make up a default filename
@@ -160,14 +217,14 @@ def download_url(url):
     citation_pdf_url = None
 
     if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
-        content = requests.get(citation_pdf_url).content
+        content = session.get(citation_pdf_url).content
         tree = parse_html(content)
         # citation_title = ...
 
     # wow, this seriously needs to be cleaned up
     if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
         citation_title = citation_title.encode("ascii", "ignore")
-        response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
+        response = session.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
         content = response.content
         if "pdf" in response.headers["content-type"]:
             extension = ".pdf"
@@ -177,7 +234,7 @@ def download_url(url):
         try:
             title = tree.xpath("//h1[@class='svTitle']")[0].text
             pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
-            new_response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"})
+            new_response = session.get(pdf_url, headers={"User-Agent": "sdf-macross"})
             new_content = new_response.content
             if "pdf" in new_response.headers["content-type"]:
                 extension = ".pdf"
@@ -212,7 +269,7 @@ def download_url(url):
         if document_id.isdigit():
             try:
                 pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
-                new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
+                new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
                 new_content = new_response.content
                 if "pdf" in new_response.headers["content-type"]:
                     extension = ".pdf"
@@ -225,7 +282,7 @@ def download_url(url):
         try:
             title = tree.xpath("//title/text()")[0].split(" | ")[0]
             pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
-            new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
+            new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
             new_content = new_response.content
             if "pdf" in new_response.headers["content-type"]:
                 extension = ".pdf"
@@ -237,7 +294,7 @@ def download_url(url):
     elif "ieeexplore.ieee.org" in url:
         try:
             pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
-            new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
+            new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
             new_content = new_response.content
             if "pdf" in new_response.headers["content-type"]:
                 extension = ".pdf"
@@ -257,7 +314,7 @@ def download_url(url):
         if pdf_url.startswith("/"):
             url_start = url[:url.find("/",8)]
             pdf_url = url_start + pdf_url
-        response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
+        response = session.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
         content = response.content
         if "pdf" in response.headers["content-type"]:
             extension = ".pdf"
@@ -272,12 +329,14 @@ def download_url(url):
     # can't create directories
     title = title.replace("/", "_")
     title = title.replace(" ", "_")
-    title = title[:20]
+    title = title[:params.maxlen]
 
     path = os.path.join(params.folder, title + extension)
 
     if extension in [".pdf", "pdf"]:
         try:
+            sys.stderr.write("got it! " +
+                             str(url) + "\n")
             content = pdfparanoia.scrub(StringIO(content))
         except:
             # this is to avoid a PDFNotImplementedError
@@ -290,7 +349,17 @@ def download_url(url):
     title = title.encode("ascii", "ignore")
     url = params.url + requests.utils.quote(title) + extension
 
-    return url
+    if extension in [".pdf", "pdf"]:
+        print url
+        return 0
+    else:
+        sys.stderr.write("couldn't find it, dump: %s\n" % url)
+        if last_resort:
+            print "couldn't find it, dump: %s" % url
+        else:
+            return 1
+    return 0
 
 def parse_html(content):
     if not isinstance(content, StringIO):
@@ -377,10 +446,19 @@ if __name__ == '__main__':
         for a in sys.argv[1:]:
             download(a)
     else:
+        reqs = []
         while True:
             l = sys.stdin.readline()
             if not l:
                 break
-            download(l)
+            if l.startswith("help") or l.startswith("HELP"):
+                print params.help
+            reqs.append(time())
+            if len(reqs) > params.thresh:
+                delay = time() - reqs[len(reqs) - params.thresh + 1]
+                if params.limit - delay > 0:
+                    print "rate limit exceeded, try again in %d second(s)" % (params.limit - delay)
+                else:
+                    download(l)
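The throttling added to the main loop keeps one timestamp per request and compares the age of the `thresh`-th most recent one against `limit` seconds. Below is a standalone sketch of that sliding-window idea; parameter names follow params.py.example, but the helper itself is illustrative and not code from the repository.

```python
from time import time

thresh = 10   # how many requests are allowed inside the window (see params.py.example)
limit = 60    # window length in seconds

reqs = []     # timestamps of every request seen so far

def allowed():
    """Record a request and say whether it may proceed."""
    now = time()
    reqs.append(now)
    if len(reqs) > thresh:
        # if the thresh-th most recent request is still inside the window, throttle
        if now - reqs[-thresh] < limit:
            return False
    return True
```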

params.py.example (new file)

@@ -0,0 +1,26 @@
+# the folder in which the papers should be stored
+folder = "/home/user/www/papers/"
+
+# the URL at which the papers will be served by an existing HTTP server
+url = "http://example.com/papers/"
+
+# the length at which the file names should be truncated
+maxlen = 42
+
+# the maximum number of requests to allow before throttling
+thresh = 10
+
+# the time period after which throttling stops
+limit = 60
+
+# help message
+help = "a nice help message"
+
+# the various servers available to retrieve the content
+# the first tuple item is the URL for the translation-server instance
+# the second tuple item is the URL of the socks proxy to use
+servers = [
+    ("http://localhost:1969/web", 'socks4://127.0.0.1:9999'),
+]
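Since `servers` is a plain list of `(translation-server URL, SOCKS proxy URL)` pairs tried in order, adding an alternative route is just another tuple. A hypothetical two-server configuration, where the second instance, its port, and its socks5 proxy are all made up for illustration:

```python
# Hypothetical params.py entry with two routes; only the first line matches
# the placeholder shipped in params.py.example.
servers = [
    ("http://localhost:1969/web", 'socks4://127.0.0.1:9999'),
    ("http://localhost:1970/web", 'socks5://127.0.0.1:8888'),
]
```

papers.py walks this list twice, first through `download_proxy()` and then through `download_url()`, before giving up on a link.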


@@ -1,3 +1,3 @@
-phenny
-requests
+requesocks
 -e git://github.com/kanzure/pdfparanoia.git@master#egg=pdfparanoia
+copy params.py.example as params.py and customize it