bmc/libbmc/fetcher.py

355 lines
12 KiB
Python
Raw Normal View History

2014-05-26 16:12:21 +02:00
# -*- coding: utf8 -*-
# -----------------------------------------------------------------------------
# "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):
# Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff (and you can also do whatever
# you want with this stuff without retaining it, but that's not cool...). If we
# meet some day, and you think this stuff is worth it, you can buy me a
# <del>beer</del> soda in return.
# Phyks
# -----------------------------------------------------------------------------
2014-07-08 21:37:13 +02:00
import isbnlib
2014-04-28 22:23:05 +02:00
import re
2014-08-02 23:34:34 +02:00
import socket
import socks
2014-04-28 22:23:05 +02:00
import subprocess
2014-05-14 23:07:06 +02:00
import sys
2014-08-02 23:34:34 +02:00
try:
# For Python 3.0 and later
2014-08-03 00:17:01 +02:00
from urllib.request import urlopen, Request
2014-08-02 23:34:34 +02:00
from urllib.error import URLError
except ImportError:
# Fall back to Python 2's urllib2
2014-08-03 00:17:01 +02:00
from urllib2 import urlopen, Request, URLError
2014-05-02 00:07:49 +02:00
import arxiv2bib as arxiv_metadata
2014-08-03 00:17:01 +02:00
import libbmc.tools as tools
from bibtexparser.bparser import BibTexParser
2014-08-03 00:09:07 +02:00
from libbmc.config import Config
2013-01-22 02:11:12 +01:00
2014-04-26 18:43:25 +02:00
2014-06-30 23:02:30 +02:00
# Module-wide configuration object; download() reads its "proxies" entry.
config = Config()
# Reference to the unpatched socket class, kept so that download() can switch
# back to direct connections after a proxy replaced socket.socket.
default_socket = socket.socket
# Encoding used to decode the output of the external text converters.
# Falls back to UTF-8 when stdout has no usable encoding (e.g. replaced or
# missing stream). No assert here: asserts are stripped under ``python -O``.
stdout_encoding = getattr(sys.stdout, 'encoding', None) or 'UTF-8'
2014-06-30 23:02:30 +02:00
2014-04-28 22:23:05 +02:00
def _split_host_port(address):
    """Split ``host[:port]`` into ``(host, port)``.

    port is returned as an int. When it is missing or not numeric, the whole
    address is returned as host and port is None.
    """
    host, sep, port = address.rpartition(':')
    if sep:
        try:
            return host, int(port)
        except ValueError:
            pass
    return address, None


def _use_proxy(proxy):
    """Patch ``socket.socket`` for proxy and return the proxy host name.

    An empty string restores the direct (unproxied) socket class. Entries
    may be ``host[:port]`` (HTTP proxy) or ``scheme://host[:port]`` where a
    scheme starting with ``socks`` selects SOCKS4 (``socks4``) or SOCKS5.
    """
    if proxy == '':
        socket.socket = default_socket
        return proxy
    scheme, sep, address = proxy.partition('://')
    if not sep:
        # No scheme given: plain "host[:port]" HTTP proxy.
        scheme, address = '', proxy
    scheme = scheme.lower()
    if scheme.startswith('socks'):
        proxy_type = socks.SOCKS4 if scheme[5:6] == '4' else socks.SOCKS5
    else:
        proxy_type = socks.HTTP
    # The port must be an int: it ends up in a socket address tuple.
    host, port = _split_host_port(address.rstrip('/'))
    socks.set_default_proxy(proxy_type, host, port)
    socket.socket = socks.socksocket
    return host


def _read_with_progress(response, size):
    """Read response to the end, drawing a 50-column progress bar on stdout.

    size is the expected number of bytes; when it is not positive (unknown
    length) the bar is skipped. Returns the raw bytes read.
    """
    chunks = []
    received = 0
    while True:
        buf = response.read(1024)
        if not buf:
            break
        chunks.append(buf)
        received += len(buf)
        if size > 0:
            # Clamp: the server may send more than it announced.
            done = min(50, int(50 * received / size))
            sys.stdout.write("\r[%s%s]" % ('='*done, ' '*(50-done)))
            sys.stdout.write(" "+str(done * 2)+"%")
            sys.stdout.flush()
    return b"".join(chunks)


def download(url):
    """Download url and return its content.

    Tries every proxy listed in the "proxies" configuration entry in turn
    (an empty entry means a direct connection) and keeps the first answer
    that has HTTP status 200 and a Content-Type mentioning pdf or djvu.

    Returns a tuple ``(content, contenttype)`` where content is the raw
    bytes and contenttype is ``'pdf'`` or ``'djvu'``, or ``(False, None)``
    if the URL is invalid or no proxy yielded a valid file.

    Note: socket.socket is left patched for the last proxy that was tried.
    """
    for proxy in config.get("proxies"):
        proxy_host = _use_proxy(proxy)
        try:
            try:
                r = urlopen(url)
            except ValueError:
                tools.warning("Invalid URL")
                return False, None
            try:
                headers = r.info()
                # Header lookup through get() is case-insensitive, unlike
                # a lookup in dict(headers).
                declared_type = (headers.get('Content-Type') or '').lower()
                if 'pdf' in declared_type:
                    contenttype = 'pdf'
                elif 'djvu' in declared_type:
                    contenttype = 'djvu'
                else:
                    contenttype = False

                # Check before reading so that unwanted bodies are not
                # downloaded at all.
                if r.getcode() != 200 or contenttype is False:
                    continue

                try:
                    size = int((headers.get('Content-Length') or '').strip())
                except ValueError:
                    size = 0  # Unknown or malformed length
                return _read_with_progress(r, size), contenttype
            finally:
                r.close()
        except URLError:
            tools.warning("Unable to get "+url+" using proxy "+proxy_host +
                          ". It " + "may not be available.")
            continue
    return False, None
2014-04-28 22:23:05 +02:00
# Matches an "isbn" marker (any case), an optional colon and whitespace, then
# captures the number itself: an optional 3-digit prefix followed by digit
# groups that may be separated by a single space or hyphen.
isbn_re = re.compile(
    r'isbn[\s]?:?[\s]?'
    r'('
    r'(?:[0-9]{3}[ -]?)?'
    r'[0-9]{1,5}[ -]?'
    r'[0-9]{1,7}[ -]?'
    r'[0-9]{1,6}[- ]?'
    r'[0-9]'
    r')',
    flags=re.IGNORECASE)
2014-04-29 21:55:35 +02:00
2014-05-26 16:50:58 +02:00
2014-04-28 22:23:05 +02:00
def findISBN(src):
    """Search for an ISBN in the text of the file src.

    src is the path to a ``.pdf`` file (converted with ``pdftotext``) or a
    ``.djvu`` file (converted with ``djvutxt``).

    Returns the ISBN without separators, or False if the extension is not
    supported, the converter exited with an error, or no ISBN was found.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return False

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process, so the
    # output is always read, even if the converter exits immediately, and a
    # full stderr pipe cannot block it.
    out, err = totext.communicate()
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    # Undecodable bytes are replaced rather than aborting the whole search.
    extractfull = ' '.join(line.strip() for line in
                           out.decode(stdout_encoding, 'replace').split('\n'))
    # NOTE(review): '&#338;' looks like an HTML-escaped character picked up
    # somewhere along the way -- confirm the intended literal.
    extractISBN = isbn_re.search(extractfull.lower().replace('&#338;', '-'))
    if not extractISBN:
        return False
    # Clean ISBN is the ISBN number without separators
    return extractISBN.group(1).replace('-', '').replace(' ', '')
def isbn2Bib(isbn):
    """Look up isbn and return the matching BibTeX entry.

    Metadata is requested from isbnlib's 'default' service (according to the
    original author, it merges results from worldcat.org and google books)
    and rendered with isbnlib's 'bibtex' formatter. Returns an empty string
    when isbnlib raises one of its own exceptions or a TypeError.
    """
    try:
        to_bibtex = isbnlib.registry.bibformatters['bibtex']
        metadata = isbnlib.meta(isbn, 'default')
        return to_bibtex(metadata)
    except (isbnlib.ISBNLibException, isbnlib.ISBNToolsException, TypeError):
        return ''
2014-04-28 22:23:05 +02:00
# All patterns are raw strings so that regex escapes such as \s or \d are not
# parsed as (invalid) string escapes; the pattern text itself is unchanged.

# Generic DOI following a "doi" marker: optional "/", ":" and whitespace,
# seven digits/dots, a slash, then a non-blank suffix ending in a digit.
doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
# PNAS identifiers following a "doi" marker.
doi_pnas_re = re.compile(r'(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
# JCB identifiers, with or without a "doi" marker.
doi_jsb_re = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
# Helpers used by findDOI() to clean up a raw match.
clean_doi_re = re.compile(r'^/')
clean_doi_fabse_re = re.compile(r'^10.1096')
clean_doi_jcb_re = re.compile(r'^10.1083')
clean_doi_len_re = re.compile(r'\d\.\d')
2014-04-28 22:23:05 +02:00
def findDOI(src):
    """Search for a DOI in the text of the file src.

    src is the path to a ``.pdf`` file (converted with ``pdftotext``) or a
    ``.djvu`` file (converted with ``djvutxt``).

    Returns the cleaned DOI, or False if the extension is not supported, the
    converter exited with an error, or no DOI was found.

    From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return False

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process, so the
    # output is always read, even if the converter exits immediately, and a
    # full stderr pipe cannot block it.
    out, err = totext.communicate()
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    # Undecodable bytes are replaced rather than aborting the whole search.
    extractfull = ' '.join(line.strip() for line in
                           out.decode(stdout_encoding, 'replace').split('\n'))
    lowered = extractfull.lower()
    # NOTE(review): '&#338;' looks like an HTML-escaped character picked up
    # somewhere along the way -- confirm the intended literal.
    extractDOI = doi_re.search(lowered.replace('&#338;', '-'))
    if not extractDOI:
        # PNAS fix
        extractDOI = doi_pnas_re.search(lowered.replace('pnas', '/pnas'))
    if not extractDOI:
        # JSB fix
        extractDOI = doi_jsb_re.search(lowered)
    if not extractDOI:
        return False

    cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
    if clean_doi_re.search(cleanDOI):
        # Drop the leading slash
        cleanDOI = cleanDOI[1:]
    # FABSE J fix
    if clean_doi_fabse_re.search(cleanDOI):
        cleanDOI = cleanDOI[:20]
    # Second JCB fix
    if clean_doi_jcb_re.search(cleanDOI):
        cleanDOI = cleanDOI[:21]
    if len(cleanDOI) > 40:
        # Over-long match: cut it at the first letter that follows a digit,
        # looking past the first 8 characters. For this scan, dots between
        # digits and hyphens count as digits, other dots count as letters.
        cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
        reps = {'.': 'A', '-': '0'}
        cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
        digitStart = 0
        for i, char in enumerate(cleanDOItemp):
            if char.isdigit():
                digitStart = 1
            if char.isalpha() and digitStart:
                break
        cleanDOI = cleanDOI[0:(8+i)]
    return cleanDOI
def doi2Bib(doi):
    """Return a BibTeX string of metadata for a given DOI.

    The DOI resolver is asked for an ``application/x-bibtex`` representation.
    Returns the decoded entry as text, or an empty string if the server
    could not be reached or answered with another content type.

    From : https://gist.github.com/jrsmith3/5513926
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    req = Request(url, headers=headers)
    try:
        r = urlopen(req)
        try:
            # get() looks the header up case-insensitively; parameters such
            # as "; charset=utf-8" are ignored for the comparison.
            content_type = r.info().get('Content-Type') or ''
            mime_type = content_type.split(';')[0].strip().lower()
            if mime_type != 'application/x-bibtex':
                return ''
            # Decode so that every code path returns text, not bytes.
            return r.read().decode('utf-8', 'replace')
        finally:
            r.close()
    except URLError:
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi '+doi)
        return ''
# Captures the run of word characters, dots, slashes and hyphens that follows
# an "arXiv:" marker (any case, optional whitespace after the colon).
arXiv_re = re.compile(
    r'arXiv:\s*([\w\.\/\-]+)',
    flags=re.IGNORECASE)
def findArXivId(src):
    """Search for an arXiv id in the text of the file src.

    src is the path to a ``.pdf`` file (converted with ``pdftotext``) or a
    ``.djvu`` file (converted with ``djvutxt``).

    Returns the arXiv id, or False if the extension is not supported, the
    converter exited with an error, or no id was found.

    From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return False

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process, so the
    # output is always read, even if the converter exits immediately, and a
    # full stderr pipe cannot block it.
    out, err = totext.communicate()
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    # Undecodable bytes are replaced rather than aborting the whole search.
    extractfull = ' '.join(line.strip() for line in
                           out.decode(stdout_encoding, 'replace').split('\n'))
    extractID = arXiv_re.search(extractfull)
    if extractID is None:
        return False
    return extractID.group(1)
def arXiv2Bib(arxiv):
    """Return a BibTeX string of metadata for the arXiv id arxiv.

    The first reference returned by arxiv2bib that is not an error is
    parsed, stripped of its 'file' field if it has one, and serialized
    again. Returns an empty string if every reference is an error.
    """
    for reference in arxiv_metadata.arxiv2bib([arxiv]):
        if isinstance(reference, arxiv_metadata.ReferenceErrorInfo):
            # Lookup failed for this reference, try the next one
            continue
        entries = BibTexParser(reference.bibtex()).get_entry_dict()
        first_key = list(entries.keys())[0]
        entry = entries[first_key]
        if 'file' in entry:
            del entry['file']
        return tools.parsed2Bibtex(entry)
    return ''
# Captures a HAL identifier ("hal-" followed by eight digits) and the version
# number given after ", version ".
HAL_re = re.compile(
    r'(hal-\d{8}), version (\d+)'
)
def findHALId(src):
    """Search for a HAL id in the text of the file src.

    src is the path to a ``.pdf`` file (converted with ``pdftotext``) or a
    ``.djvu`` file (converted with ``djvutxt``).

    Returns a tuple of the HAL id and the version, or False if the
    extension is not supported, the converter exited with an error, or no
    id was found.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return False

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process, so the
    # output is always read, even if the converter exits immediately, and a
    # full stderr pipe cannot block it.
    out, err = totext.communicate()
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    # Undecodable bytes are replaced rather than aborting the whole search.
    extractfull = ' '.join(line.strip() for line in
                           out.decode(stdout_encoding, 'replace').split('\n'))
    extractID = HAL_re.search(extractfull)
    if extractID is None:
        # No match: report it as documented instead of failing on .group()
        return False
    return extractID.group(1), extractID.group(2)