Readd downloader code

2016-01-10 18:19:30 +01:00 · 2016-01-10 18:19:30 +01:00 · 7a281528e3
commit 7a281528e3
parent 4a89c8c136
1 changed files with 110 additions and 0 deletions
--- a/libbmc/fetcher.py
+++ b/libbmc/fetcher.py
@ -0,0 +1,110 @@
+"""
+This file contains functions to download locally some papers, eventually using
+a proxy.
+"""
+import socket
+import socks
+import sys
+import urllib
+
+
+# Default socket to use, if no proxy is used
+DEFAULT_SOCKET = socket.socket
+
+
+def download(url, proxies=[None]):
+    """
+    Download a PDF or DJVU document from a url, eventually using proxies.
+
+    :params url: The URL to the PDF/DJVU document to fetch.
+    :params proxies: An optional list of proxies to use. Proxies will be \
+            used sequentially. Proxies should be a list of proxy strings. \
+            Do not forget to include ``None`` in the list if you want to try \
+            direct fetching without any proxy.
+
+    :returns: A tuple of the raw content of the downloaded data and its \
+            associated content-type. Returns ``(None, None)`` if it was \
+            unable to download the document.
+    """
+    # Loop over all available connections
+    for proxy in proxies:
+        # Handle no proxy case
+        if proxy is None:
+            socket.socket = DEFAULT_SOCKET
+        # Handle SOCKS proxy
+        elif proxy.startswith('socks'):
+            if proxy[5] == '4':
+                proxy_type = socks.SOCKS4
+            else:
+                proxy_type = socks.SOCKS5
+            proxy = proxy[proxy.find('://') + 3:]
+            try:
+                proxy, port = proxy.split(':')
+            except ValueError:
+                port = None
+            socks.set_default_proxy(proxy_type, proxy, port)
+            socket.socket = socks.socksocket
+        # Handle generic HTTP proxy
+        else:
+            try:
+                proxy, port = proxy.split(':')
+            except ValueError:
+                port = None
+            socks.set_default_proxy(socks.HTTP, proxy, port)
+            socket.socket = socks.socksocket
+
+        # Try to fetch the URL using the current proxy
+        try:
+            r = urllib.request.urlopen(url)
+            try:
+                size = int(dict(r.info())['content-length'].strip())
+            except KeyError:
+                try:
+                    size = int(dict(r.info())['Content-Length'].strip())
+                except KeyError:
+                    size = 0
+            # Download the document
+            dl = b""
+            dl_size = 0
+            while True:
+                buf = r.read(1024)
+                if buf:
+                    dl += buf
+                    dl_size += len(buf)
+                    if size != 0:
+                        # Write progress bar on stdout
+                        done = int(50 * dl_size / size)
+                        sys.stdout.write("\r[%s%s]" %
+                                         ('='*done, ' '*(50-done)))
+                        sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
+                        sys.stdout.flush()
+                else:
+                    break
+            # Fetch content type
+            contenttype = False
+            contenttype_req = None
+            try:
+                contenttype_req = dict(r.info())['content-type']
+            except KeyError:
+                try:
+                    contenttype_req = dict(r.info())['Content-Type']
+                except KeyError:
+                    continue
+            if 'pdf' in contenttype_req:
+                contenttype = 'pdf'
+            elif 'djvu' in contenttype_req:
+                contenttype = 'djvu'
+
+            # Check content type and status code are ok
+            if r.getcode() != 200 or contenttype is False:
+                # Else, try with the next available proxy
+                continue
+
+            # Return a tuple of the downloaded content and the content-type
+            return (dl, contenttype)
+        # If an exception occurred, continue with next available proxy
+        except (urllib.error.URLError, socket.error, ValueError):
+            continue
+
+    # In case of running out of proxies, return (None, None)
+    return (None, None)