2016-01-10 18:19:30 +01:00
|
|
|
"""
|
|
|
|
This file contains functions to download locally some papers, eventually using
|
|
|
|
a proxy.
|
|
|
|
"""
|
|
|
|
import socket
|
|
|
|
import sys
|
|
|
|
import urllib
|
|
|
|
|
2016-02-17 16:04:36 +01:00
|
|
|
import socks
|
|
|
|
|
2016-01-10 18:19:30 +01:00
|
|
|
|
|
|
|
# Default socket to use, if no proxy is used
|
|
|
|
DEFAULT_SOCKET = socket.socket
|
|
|
|
|
|
|
|
|
2016-02-17 16:04:36 +01:00
|
|
|
def _download_helper(url):
|
|
|
|
"""
|
|
|
|
Handle the download of an URL, using the proxy currently set in \
|
|
|
|
:mod:`socks`.
|
|
|
|
|
|
|
|
:param url: The URL to download.
|
|
|
|
:returns: A tuple of the raw content of the downloaded data and its \
|
|
|
|
associated content-type. Returns None if it was \
|
|
|
|
unable to download the document.
|
|
|
|
"""
|
|
|
|
# Try to fetch the URL using the current proxy
|
|
|
|
try:
|
|
|
|
request = urllib.request.urlopen(url)
|
|
|
|
try:
|
|
|
|
size = int(dict(request.info())['content-length'].strip())
|
|
|
|
except KeyError:
|
|
|
|
try:
|
|
|
|
size = int(dict(request.info())['Content-Length'].strip())
|
|
|
|
except KeyError:
|
|
|
|
size = 0
|
|
|
|
# Download the document
|
|
|
|
doc = b""
|
|
|
|
doc_size = 0
|
|
|
|
while True:
|
|
|
|
buf = request.read(1024)
|
|
|
|
if buf:
|
|
|
|
doc += buf
|
|
|
|
doc_size += len(buf)
|
|
|
|
if size != 0:
|
|
|
|
# Write progress bar on stdout
|
|
|
|
done = int(50 * doc_size / size)
|
|
|
|
sys.stdout.write("\r[%s%s]" %
|
|
|
|
('='*done, ' '*(50-done)))
|
|
|
|
sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
|
|
|
|
sys.stdout.flush()
|
|
|
|
else:
|
|
|
|
break
|
|
|
|
# Fetch content type
|
|
|
|
contenttype = None
|
|
|
|
contenttype_req = None
|
|
|
|
try:
|
|
|
|
contenttype_req = dict(request.info())['content-type']
|
|
|
|
except KeyError:
|
|
|
|
try:
|
|
|
|
contenttype_req = dict(request.info())['Content-Type']
|
|
|
|
except KeyError:
|
|
|
|
return None
|
|
|
|
if 'pdf' in contenttype_req:
|
|
|
|
contenttype = 'pdf'
|
|
|
|
elif 'djvu' in contenttype_req:
|
|
|
|
contenttype = 'djvu'
|
|
|
|
|
|
|
|
# Check content type and status code are ok
|
|
|
|
if request.getcode() != 200 or contenttype is None:
|
|
|
|
# Else, try with the next available proxy
|
|
|
|
return None
|
|
|
|
|
|
|
|
# Return a tuple of the downloaded content and the content-type
|
|
|
|
return (doc, contenttype)
|
|
|
|
# If an exception occurred, continue with next available proxy
|
|
|
|
except (urllib.error.URLError, socket.error, ValueError):
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
def download(url, proxies=None):
|
2016-01-10 18:19:30 +01:00
|
|
|
"""
|
|
|
|
Download a PDF or DJVU document from a url, eventually using proxies.
|
|
|
|
|
|
|
|
:params url: The URL to the PDF/DJVU document to fetch.
|
|
|
|
:params proxies: An optional list of proxies to use. Proxies will be \
|
|
|
|
used sequentially. Proxies should be a list of proxy strings. \
|
2016-02-17 16:04:36 +01:00
|
|
|
Do not forget to include ``""`` (empty string) in the list if \
|
|
|
|
you want to try direct fetching without any proxy.
|
2016-01-10 18:19:30 +01:00
|
|
|
|
|
|
|
:returns: A tuple of the raw content of the downloaded data and its \
|
|
|
|
associated content-type. Returns ``(None, None)`` if it was \
|
|
|
|
unable to download the document.
|
2016-01-20 22:35:43 +01:00
|
|
|
|
2016-01-30 16:51:30 +01:00
|
|
|
>>> download("http://arxiv.org/pdf/1312.4006.pdf") # doctest: +SKIP
|
2016-01-10 18:19:30 +01:00
|
|
|
"""
|
2016-02-17 16:04:36 +01:00
|
|
|
# Handle default argument
|
|
|
|
if proxies is None:
|
|
|
|
proxies = [""]
|
|
|
|
|
2016-01-10 18:19:30 +01:00
|
|
|
# Loop over all available connections
|
|
|
|
for proxy in proxies:
|
|
|
|
# Handle no proxy case
|
2016-02-17 16:04:36 +01:00
|
|
|
if proxy == "":
|
2016-01-10 18:19:30 +01:00
|
|
|
socket.socket = DEFAULT_SOCKET
|
|
|
|
# Handle SOCKS proxy
|
|
|
|
elif proxy.startswith('socks'):
|
|
|
|
if proxy[5] == '4':
|
|
|
|
proxy_type = socks.SOCKS4
|
|
|
|
else:
|
|
|
|
proxy_type = socks.SOCKS5
|
|
|
|
proxy = proxy[proxy.find('://') + 3:]
|
|
|
|
try:
|
|
|
|
proxy, port = proxy.split(':')
|
|
|
|
except ValueError:
|
|
|
|
port = None
|
|
|
|
socks.set_default_proxy(proxy_type, proxy, port)
|
|
|
|
socket.socket = socks.socksocket
|
|
|
|
# Handle generic HTTP proxy
|
|
|
|
else:
|
|
|
|
try:
|
|
|
|
proxy, port = proxy.split(':')
|
|
|
|
except ValueError:
|
|
|
|
port = None
|
|
|
|
socks.set_default_proxy(socks.HTTP, proxy, port)
|
|
|
|
socket.socket = socks.socksocket
|
|
|
|
|
2016-02-17 16:04:36 +01:00
|
|
|
downloaded = _download_helper(url)
|
|
|
|
if downloaded is not None:
|
|
|
|
return downloaded
|
2016-01-10 18:19:30 +01:00
|
|
|
|
|
|
|
# In case of running out of proxies, return (None, None)
|
|
|
|
return (None, None)
|