Config file, SOCKS support, multiple servers

parent 86c2e11a8c
commit e19aa9e534

README.md (20 lines changed)
@@ -1,12 +1,16 @@
 # paperbot
 
-Paperbot is an IRC bot that fetches academic papers. It monitors all conversation for links to scholarly content, then fetches the content and posts a public link. This seems to help enhance the quality of discussion and make us less ignorant. When a link fails to lead to a pdf with the zotero translators, paperbot will not attempt further downloads of the paper unless paperbot was specifically spoken to.
+Paperbot is a command line utility that fetches academic papers. When given a URL on stdin or as a CLI argument, it fetches the content and returns a public link on stdout. This seems to help enhance the quality of discussion and make us less ignorant.
 
+Paperbot can easily be turned back into an IRC bot with [irctk](http://gitorious.org/irctk).
+
 <div id="details" />
 <div id="deets" />
 ## deets
 
-All content is scraped using [zotero/translators](https://github.com/zotero/translators). These are javascript scrapers that work on a large number of academic publisher sites and are actively maintained. Paperbot offloads links to [zotero/translation-server](https://github.com/zotero/translation-server), which runs the zotero scrapers headlessly in a gecko and xulrunner environment. The scrapers return metadata and a link to the pdf. Then paperbot fetches that particular pdf. Sometimes in IRC someone drops a link straight to a pdf, which paperbot is also happy to compulsively archive.
+All content is scraped using [zotero/translators](https://github.com/zotero/translators). These are javascript scrapers that work on a large number of academic publisher sites and are actively maintained. Paperbot offloads links to [zotero/translation-server](https://github.com/zotero/translation-server), which runs the zotero scrapers headlessly in a gecko and xulrunner environment. The scrapers return metadata and a link to the pdf. Then paperbot fetches that particular pdf. When given a link straight to a pdf, paperbot is also happy to compulsively archive it.
 
+Paperbot can try multiple instances of translation-server (each configured to access content in a different way) and different SOCKS proxies to retrieve the content.
+
 * [zotero/translators](https://github.com/zotero/translators)
 * [zotero/translation-server](https://github.com/zotero/translation-server)
@@ -14,19 +18,9 @@ All content is scraped using [zotero/translators](https://github.com/zotero/translators)
 * [phenny](https://github.com/sbp/phenny)
 * [pdfparanoia](https://github.com/kanzure/pdfparanoia)
 
-<div id="todo" />
-## TODO
-
-It would be nice to use multiple proxies to resolve a pdf request.
-
-<div id="demo" />
-<div id="channel" />
-## active demo
-
-say hi to paperbot on irc.freenode.net ##hplusroadmap
-
 <div id="license" />
 ## license
 
 BSD.
+
+Original project is: https://github.com/kanzure/paperbot
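With the README now describing stdin/CLI use, invocation presumably looks something like the following (a sketch only; the URL is made up, and params.py is assumed to be configured as shown further down):

    python papers.py http://example.com/some/article
    echo "http://example.com/some/article" | python papers.py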
papers.py (284 lines changed)

@@ -1,4 +1,5 @@
-#!/usr/bin/python
+#!/usr/bin/python -u
+# coding=utf8
 """
 Fetches papers.
 """
@@ -7,13 +8,124 @@ import os
 import json
 import params
 import random
-import requests
+import requesocks as requests
 import lxml.etree
 import sys
+from time import time
 from StringIO import StringIO
 
 import pdfparanoia
 
+
+def download_proxy(line, zotero, proxy, verbose=True):
+    sys.stderr.write("attempting download of %s through %s and %s\n" %
+                     (line, zotero, proxy))
+
+    headers = {
+        "Content-Type": "application/json",
+    }
+
+    data = {
+        "url": line,
+        "sessionid": "what"
+    }
+
+    data = json.dumps(data)
+
+    response = requests.post(zotero, data=data, headers=headers)
+
+    if response.status_code != 200 or response.content == "[]":
+        sys.stderr.write("no valid reply from zotero\n")
+        sys.stderr.write("status %d\n" % response.status_code)
+        sys.stderr.write("content %s\n" % response.content)
+        return -1  # fatal
+
+    sys.stderr.write("content %s\n" % response.content)
+    # see if there are any attachments
+    content = json.loads(response.content)
+    item = content[0]
+    title = item["title"]
+
+    if not item.has_key("attachments"):
+        sys.stderr.write("no attachment with this proxy\n")
+        return 1  # try another proxy
+
+    pdf_url = None
+    for attachment in item["attachments"]:
+        if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
+            pdf_url = attachment["url"]
+            break
+
+    if not pdf_url:
+        sys.stderr.write("no PDF attachment with this proxy\n")
+        return 1  # try another proxy
+
+    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
+
+    headers = {
+        "User-Agent": user_agent,
+    }
+
+    sys.stderr.write("try retrieving " +
+                     str(pdf_url) + " through proxy " + proxy + "\n")
+    response = None
+    session = requests.Session()
+    session.proxies = {
+        'http': proxy,
+        'https': proxy}
+
+    try:
+        if pdf_url.startswith("https://"):
+            response = session.get(pdf_url, headers=headers, verify=False)
+        else:
+            response = session.get(pdf_url, headers=headers)
+    except requests.exceptions.ConnectionError:
+        sys.stderr.write("network failure on download " +
+                         str(pdf_url) + "\n")
+        return 1
+
+    # detect failure
+    if response.status_code == 401:
+        sys.stderr.write("HTTP 401 unauthorized when trying to fetch " +
+                         str(pdf_url) + "\n")
+        return 1
+    elif response.status_code != 200:
+        sys.stderr.write("HTTP " + str(response.status_code)
+                         + " when trying to fetch " + str(pdf_url) + "\n")
+        return 1
+
+    data = response.content
+
+    if "pdf" in response.headers["content-type"]:
+        try:
+            data = pdfparanoia.scrub(StringIO(data))
+        except:
+            # this is to avoid a PDFNotImplementedError
+            pass
+
+    # grr..
+    title = title.encode("ascii", "ignore")
+    title = title.replace(" ", "_")
+    title = title[:params.maxlen]
+
+    path = os.path.join(params.folder, title + ".pdf")
+
+    file_handler = open(path, "w")
+    file_handler.write(data)
+    file_handler.close()
+
+    filename = requests.utils.quote(title)
+
+    # Remove an ending period, which sometimes happens when the
+    # title of the paper has a period at the end.
+    if filename[-1] == ".":
+        filename = filename[:-1]
+
+    url = params.url + filename + ".pdf"
+
+    print(url)
+    return 0
+
+
 def download(line, verbose=True):
     """
     Downloads a paper.
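For orientation: download_proxy() posts the link to a translation-server instance and then scans the JSON reply for a PDF attachment. A reply shaped roughly like the following (field values invented for illustration) is what the attachment check accepts:

    [
      {
        "title": "An Example Paper Title",
        "attachments": [
          {"mimeType": "application/pdf", "url": "http://publisher.example.org/fulltext.pdf"}
        ]
      }
    ]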
@@ -29,98 +141,29 @@ def download(line, verbose=True):
     line = fix_ieee_login_urls(line)
     line = fix_jstor_pdf_urls(line)
 
-    translation_url = params.server
-
-    headers = {
-        "Content-Type": "application/json",
-    }
-
-    data = {
-        "url": line,
-        "sessionid": "what"
-    }
-
-    data = json.dumps(data)
-
-    response = requests.post(translation_url, data=data, headers=headers)
-
-    if response.status_code == 200 and response.content != "[]":
-        # see if there are any attachments
-        content = json.loads(response.content)
-        item = content[0]
-        title = item["title"]
-
-        if item.has_key("attachments"):
-            pdf_url = None
-            for attachment in item["attachments"]:
-                if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
-                    pdf_url = attachment["url"]
-                    break
-
-            if pdf_url:
-                user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
-
-                headers = {
-                    "User-Agent": user_agent,
-                }
-
-                response = None
-                if pdf_url.startswith("https://"):
-                    response = requests.get(pdf_url, headers=headers, verify=False)
-                else:
-                    response = requests.get(pdf_url, headers=headers)
-
-                # detect failure
-                if response.status_code == 401:
-                    print("HTTP 401 unauthorized " + str(pdf_url))
-                    continue
-                elif response.status_code != 200:
-                    print("HTTP " + str(response.status_code) + " " + str(pdf_url))
-                    continue
-
-                data = response.content
-
-                if "pdf" in response.headers["content-type"]:
-                    try:
-                        data = pdfparanoia.scrub(StringIO(data))
-                    except:
-                        # this is to avoid a PDFNotImplementedError
-                        pass
-
-                # grr..
-                title = title.encode("ascii", "ignore")
-
-                path = os.path.join(params.folder, title + ".pdf")
-
-                file_handler = open(path, "w")
-                file_handler.write(data)
-                file_handler.close()
-
-                filename = requests.utils.quote(title)
-
-                # Remove an ending period, which sometimes happens when the
-                # title of the paper has a period at the end.
-                if filename[-1] == ".":
-                    filename = filename[:-1]
-
-                url = params.url + filename + ".pdf"
-
-                print(url)
-                continue
-            else:
-                print(download_url(line))
-                continue
-        else:
-            print(download_url(line))
-            continue
-    else:
-        if response.status_code == 501:
-            if verbose:
-                print("no translator available, raw dump: " + download_url(line))
-        else:
-            if verbose:
-                print("error: HTTP " + str(response.status_code) + " " + download_url(line))
+    ok = False
+
+    for (zotero, proxy) in params.servers:
+        s = download_proxy(line, zotero, proxy, verbose)
+        if s < 0:
+            break
+        if s == 0:
+            ok = True
+            break
+    if not ok:
+        for (zotero, proxy) in params.servers:
+            s = download_url(line, proxy)
+            sys.stderr.write("return code " + str(s) + "\n")
+            if s == 0:
+                ok = True
+                break
+    if not ok:
+        s = download_url(line, params.servers[0][1], last_resort=True)
+        if s != 0:
+            print "couldn't get it at all :("
+
     return
 
 download.commands = ["fetch", "get", "download"]
 download.priority = "high"
 download.rule = r'(.*)'
@@ -135,8 +178,22 @@ def download_ieee(url):
     # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
     raise NotImplementedError
 
-def download_url(url):
-    response = requests.get(url, headers={"User-Agent": "origami-pdf"})
+def download_url(url, proxy, last_resort=False):
+    sys.stderr.write("attempting direct for %s through %s\n" % (url,
+                     proxy))
+
+    session = requests.Session()
+    session.proxies = {
+        'http': proxy,
+        'https': proxy}
+
+    try:
+        response = session.get(url, headers={"User-Agent": "origami-pdf"})
+    except requests.exceptions.ConnectionError:
+        sys.stderr.write("network failure on download " +
+                         str(url) + "\n")
+        return 1
 
     content = response.content
 
     # just make up a default filename
@@ -160,14 +217,14 @@ def download_url(url):
     citation_pdf_url = None
 
     if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
-        content = requests.get(citation_pdf_url).content
+        content = session.get(citation_pdf_url).content
         tree = parse_html(content)
         # citation_title = ...
 
     # wow, this seriously needs to be cleaned up
     if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
         citation_title = citation_title.encode("ascii", "ignore")
-        response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
+        response = session.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
         content = response.content
         if "pdf" in response.headers["content-type"]:
             extension = ".pdf"
@@ -177,7 +234,7 @@ def download_url(url):
         try:
             title = tree.xpath("//h1[@class='svTitle']")[0].text
             pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
-            new_response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"})
+            new_response = session.get(pdf_url, headers={"User-Agent": "sdf-macross"})
             new_content = new_response.content
             if "pdf" in new_response.headers["content-type"]:
                 extension = ".pdf"
@@ -212,7 +269,7 @@ def download_url(url):
         if document_id.isdigit():
             try:
                 pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
-                new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
+                new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
                 new_content = new_response.content
                 if "pdf" in new_response.headers["content-type"]:
                     extension = ".pdf"
@@ -225,7 +282,7 @@ def download_url(url):
         try:
             title = tree.xpath("//title/text()")[0].split(" | ")[0]
             pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
-            new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
+            new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
             new_content = new_response.content
             if "pdf" in new_response.headers["content-type"]:
                 extension = ".pdf"
@@ -237,7 +294,7 @@ def download_url(url):
     elif "ieeexplore.ieee.org" in url:
         try:
             pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
-            new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
+            new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
             new_content = new_response.content
             if "pdf" in new_response.headers["content-type"]:
                 extension = ".pdf"
@@ -257,7 +314,7 @@ def download_url(url):
     if pdf_url.startswith("/"):
         url_start = url[:url.find("/",8)]
         pdf_url = url_start + pdf_url
-        response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
+        response = session.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
         content = response.content
         if "pdf" in response.headers["content-type"]:
             extension = ".pdf"
@@ -272,12 +329,14 @@ def download_url(url):
     # can't create directories
     title = title.replace("/", "_")
     title = title.replace(" ", "_")
-    title = title[:20]
+    title = title[:params.maxlen]
 
     path = os.path.join(params.folder, title + extension)
 
     if extension in [".pdf", "pdf"]:
         try:
+            sys.stderr.write("got it! " +
+                             str(url) + "\n")
             content = pdfparanoia.scrub(StringIO(content))
         except:
             # this is to avoid a PDFNotImplementedError
@@ -290,7 +349,17 @@ def download_url(url):
     title = title.encode("ascii", "ignore")
     url = params.url + requests.utils.quote(title) + extension
 
-    return url
+    if extension in [".pdf", "pdf"]:
+        print url
+        return 0
+    else:
+        sys.stderr.write("couldn't find it, dump: %s\n" % url)
+        if last_resort:
+            print "couldn't find it, dump: %s" % url
+        else:
+            return 1
+    return 0
 
 
 def parse_html(content):
     if not isinstance(content, StringIO):
@@ -377,10 +446,19 @@ if __name__ == '__main__':
         for a in sys.argv[1:]:
             download(a)
     else:
+        reqs = []
         while True:
             l = sys.stdin.readline()
             if not l:
                 break
-            download(l)
+            if l.startswith("help") or l.startswith("HELP"):
+                print params.help
+            reqs.append(time())
+            if len(reqs) > params.thresh:
+                delay = time() - reqs[len(reqs) - params.thresh + 1]
+                if params.limit - delay > 0:
+                    print "rate limit exceeded, try again in %d second(s)" % (params.limit - delay)
+            else:
+                download(l)
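The new stdin loop throttles on params.thresh and params.limit (see params.py.example below): once more than thresh requests have been seen, a further request is refused while the recent ones are less than limit seconds old. A standalone sketch of that idea, with names chosen here for illustration:

    from time import time

    THRESH = 10      # cf. params.thresh: requests allowed before throttling kicks in
    LIMIT = 60       # cf. params.limit: window size in seconds
    timestamps = []  # one entry per request seen

    def allowed():
        # record this request, then check the sliding window
        now = time()
        timestamps.append(now)
        if len(timestamps) <= THRESH:
            return True
        # oldest request among the most recent THRESH (including this one)
        return now - timestamps[-THRESH] >= LIMIT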
params.py.example (new file, 26 lines)

@@ -0,0 +1,26 @@
+# the folder in which the papers should be stored
+folder = "/home/user/www/papers/"
+
+# the URL at which the papers will be served by an existing HTTP server
+url = "http://example.com/papers/"
+
+# the length at which the file names should be truncated
+maxlen = 42
+
+# the maximum number of requests to allow before throttling
+thresh = 10
+
+# the time period after which throttling stops
+limit = 60
+
+# help message
+help = "a nice help message"
+
+# the various servers available to retrieve the content
+# the first tuple item is the URL for the translation-server instance
+# the second tuple item is the URL of the socks proxy to use
+servers = [
+    ("http://localhost:1969/web", 'socks4://127.0.0.1:9999'),
+]
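Since download() walks params.servers in order, listing several (translation-server, SOCKS proxy) pairs gives fallback between access paths; for example (the second instance's port and the socks5 address are made up):

    servers = [
        ("http://localhost:1969/web", 'socks4://127.0.0.1:9999'),
        ("http://localhost:1970/web", 'socks5://127.0.0.1:9050'),
    ]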
@@ -1,3 +1,3 @@
-phenny
-requests
+requesocks
 -e git://github.com/kanzure/pdfparanoia.git@master#egg=pdfparanoia
+copy params.py.example as params.py and customize it
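Putting it together, setup after this commit presumably amounts to the following (assuming the hunk above is the project's pip requirements file):

    cp params.py.example params.py    # then edit folder, url, servers, ...
    pip install -r requirements.txt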