Readd downloader code
This commit is contained in:
parent
4a89c8c136
commit
7a281528e3
110
libbmc/fetcher.py
Normal file
110
libbmc/fetcher.py
Normal file
@ -0,0 +1,110 @@
|
||||
"""
|
||||
This file contains functions to download locally some papers, eventually using
|
||||
a proxy.
|
||||
"""
|
||||
import socket
|
||||
import socks
|
||||
import sys
|
||||
import urllib
|
||||
|
||||
|
||||
# Default socket to use, if no proxy is used
|
||||
DEFAULT_SOCKET = socket.socket
|
||||
|
||||
|
||||
def download(url, proxies=[None]):
|
||||
"""
|
||||
Download a PDF or DJVU document from a url, eventually using proxies.
|
||||
|
||||
:params url: The URL to the PDF/DJVU document to fetch.
|
||||
:params proxies: An optional list of proxies to use. Proxies will be \
|
||||
used sequentially. Proxies should be a list of proxy strings. \
|
||||
Do not forget to include ``None`` in the list if you want to try \
|
||||
direct fetching without any proxy.
|
||||
|
||||
:returns: A tuple of the raw content of the downloaded data and its \
|
||||
associated content-type. Returns ``(None, None)`` if it was \
|
||||
unable to download the document.
|
||||
"""
|
||||
# Loop over all available connections
|
||||
for proxy in proxies:
|
||||
# Handle no proxy case
|
||||
if proxy is None:
|
||||
socket.socket = DEFAULT_SOCKET
|
||||
# Handle SOCKS proxy
|
||||
elif proxy.startswith('socks'):
|
||||
if proxy[5] == '4':
|
||||
proxy_type = socks.SOCKS4
|
||||
else:
|
||||
proxy_type = socks.SOCKS5
|
||||
proxy = proxy[proxy.find('://') + 3:]
|
||||
try:
|
||||
proxy, port = proxy.split(':')
|
||||
except ValueError:
|
||||
port = None
|
||||
socks.set_default_proxy(proxy_type, proxy, port)
|
||||
socket.socket = socks.socksocket
|
||||
# Handle generic HTTP proxy
|
||||
else:
|
||||
try:
|
||||
proxy, port = proxy.split(':')
|
||||
except ValueError:
|
||||
port = None
|
||||
socks.set_default_proxy(socks.HTTP, proxy, port)
|
||||
socket.socket = socks.socksocket
|
||||
|
||||
# Try to fetch the URL using the current proxy
|
||||
try:
|
||||
r = urllib.request.urlopen(url)
|
||||
try:
|
||||
size = int(dict(r.info())['content-length'].strip())
|
||||
except KeyError:
|
||||
try:
|
||||
size = int(dict(r.info())['Content-Length'].strip())
|
||||
except KeyError:
|
||||
size = 0
|
||||
# Download the document
|
||||
dl = b""
|
||||
dl_size = 0
|
||||
while True:
|
||||
buf = r.read(1024)
|
||||
if buf:
|
||||
dl += buf
|
||||
dl_size += len(buf)
|
||||
if size != 0:
|
||||
# Write progress bar on stdout
|
||||
done = int(50 * dl_size / size)
|
||||
sys.stdout.write("\r[%s%s]" %
|
||||
('='*done, ' '*(50-done)))
|
||||
sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
|
||||
sys.stdout.flush()
|
||||
else:
|
||||
break
|
||||
# Fetch content type
|
||||
contenttype = False
|
||||
contenttype_req = None
|
||||
try:
|
||||
contenttype_req = dict(r.info())['content-type']
|
||||
except KeyError:
|
||||
try:
|
||||
contenttype_req = dict(r.info())['Content-Type']
|
||||
except KeyError:
|
||||
continue
|
||||
if 'pdf' in contenttype_req:
|
||||
contenttype = 'pdf'
|
||||
elif 'djvu' in contenttype_req:
|
||||
contenttype = 'djvu'
|
||||
|
||||
# Check content type and status code are ok
|
||||
if r.getcode() != 200 or contenttype is False:
|
||||
# Else, try with the next available proxy
|
||||
continue
|
||||
|
||||
# Return a tuple of the downloaded content and the content-type
|
||||
return (dl, contenttype)
|
||||
# If an exception occurred, continue with next available proxy
|
||||
except (urllib.error.URLError, socket.error, ValueError):
|
||||
continue
|
||||
|
||||
# In case of running out of proxies, return (None, None)
|
||||
return (None, None)
|
Loading…
x
Reference in New Issue
Block a user