bmc/libbmc/fetcher.py

361 lines
13 KiB
Python
Raw Normal View History

2014-05-26 16:12:21 +02:00
# -*- coding: utf8 -*-
# -----------------------------------------------------------------------------
# "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):
# Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff (and you can also do whatever
# you want with this stuff without retaining it, but that's not cool...). If we
# meet some day, and you think this stuff is worth it, you can buy me a
# <del>beer</del> soda in return.
# Phyks
# -----------------------------------------------------------------------------
2014-07-08 21:37:13 +02:00
import isbnlib
2014-04-28 22:23:05 +02:00
import re
2014-08-02 23:34:34 +02:00
import socket
import socks
2014-04-28 22:23:05 +02:00
import subprocess
2014-05-14 23:07:06 +02:00
import sys
2014-08-02 23:34:34 +02:00
try:
# For Python 3.0 and later
2014-08-03 00:17:01 +02:00
from urllib.request import urlopen, Request
2014-08-02 23:34:34 +02:00
from urllib.error import URLError
except ImportError:
# Fall back to Python 2's urllib2
2015-09-05 16:55:14 +02:00
from urllib2 import urlopen, Request, URLError
2014-05-02 00:07:49 +02:00
import arxiv2bib as arxiv_metadata
2014-08-03 00:17:01 +02:00
import libbmc.tools as tools
2014-12-03 12:54:24 +01:00
import bibtexparser
2014-08-03 00:09:07 +02:00
from libbmc.config import Config
2013-01-22 02:11:12 +01:00
2014-04-26 18:43:25 +02:00
2014-06-30 23:02:30 +02:00
# Shared configuration instance, plus a handle on the pristine socket
# class so download() can restore direct connections after proxying.
config = Config()
default_socket = socket.socket
2014-08-03 21:37:34 +02:00
# Encoding used to decode subprocess (pdftotext/djvutxt) output.
# Fall back to UTF-8 when stdout has no usable encoding (e.g. when the
# output is piped, or sys.stdout has been replaced).
# Fix: the previous code relied on `assert`, which is stripped when
# Python runs with -O, leaving stdout_encoding possibly None.
stdout_encoding = getattr(sys.stdout, 'encoding', None)
if stdout_encoding is None:
    stdout_encoding = 'UTF-8'
2014-06-30 23:02:30 +02:00
2014-04-28 22:23:05 +02:00
def download(url):
    """Download the file at url.

    Checks that it is a valid pdf or djvu file (based on the
    Content-Type response header). Tries all the available proxies
    sequentially.

    Returns a (content, filetype) tuple where content is the raw bytes
    and filetype is 'pdf' or 'djvu', or (False, None) if the file could
    not be downloaded.
    """
    for proxy in config.get("proxies"):
        if proxy.startswith('socks'):
            # Proxy looks like 'socks4://host:port' or 'socks5://host:port';
            # the char right after "socks" selects the protocol version.
            if proxy[5] == '4':
                proxy_type = socks.SOCKS4
            else:
                proxy_type = socks.SOCKS5
            proxy = proxy[proxy.find('://')+3:]
            try:
                proxy, port = proxy.split(':')
                # Fix: PySocks expects an int port, not the string from split()
                port = int(port)
            except ValueError:
                port = None
            socks.set_default_proxy(proxy_type, proxy, port)
            socket.socket = socks.socksocket
        elif proxy == '':
            # Empty proxy entry means a direct connection.
            socket.socket = default_socket
        else:
            # Any other entry is treated as an HTTP proxy 'host:port'.
            try:
                proxy, port = proxy.split(':')
                # Fix: PySocks expects an int port, not the string from split()
                port = int(port)
            except ValueError:
                port = None
            socks.set_default_proxy(socks.HTTP, proxy, port)
            socket.socket = socks.socksocket
        try:
            r = urlopen(url)
            # Content-Length header name case differs between Python 2 and 3.
            try:
                size = int(dict(r.info())['content-length'].strip())
            except KeyError:
                try:
                    size = int(dict(r.info())['Content-Length'].strip())
                except KeyError:
                    size = 0
            dl = b""
            dl_size = 0
            while True:
                buf = r.read(1024)
                if buf:
                    dl += buf
                    dl_size += len(buf)
                    if size != 0:
                        # Draw a 50-char progress bar on stdout.
                        done = int(50 * dl_size / size)
                        sys.stdout.write("\r[%s%s]" % ('='*done, ' '*(50-done)))
                        # Fix: was done/52, which capped the display at ~96%
                        sys.stdout.write(" "+str(int(float(done)/50*100))+"%")
                        sys.stdout.flush()
                else:
                    break
            contenttype = False
            contenttype_req = None
            try:
                contenttype_req = dict(r.info())['content-type']
            except KeyError:
                try:
                    contenttype_req = dict(r.info())['Content-Type']
                except KeyError:
                    # No Content-Type at all: try the next proxy.
                    continue
            # (The previous try/except KeyError here was dead code:
            # substring tests on a string cannot raise KeyError.)
            if 'pdf' in contenttype_req:
                contenttype = 'pdf'
            elif 'djvu' in contenttype_req:
                contenttype = 'djvu'

            if r.getcode() != 200 or contenttype is False:
                continue

            return dl, contenttype
        except ValueError:
            tools.warning("Invalid URL")
            return False, None
        except (URLError, socket.error):
            if proxy != "":
                proxy_txt = "using proxy "+proxy
            else:
                proxy_txt = "without using any proxy"
            tools.warning("Unable to get "+url+" "+proxy_txt+". It " +
                          "may not be available at the moment.")
            continue
    return False, None
2014-04-28 22:23:05 +02:00
# Matches an ISBN-10 or ISBN-13 preceded by an "isbn" label, allowing
# optional separators (spaces or dashes) between the digit groups.
isbn_re = re.compile(
    r'isbn[\s]?:?[\s]?((?:[0-9]{3}[ -]?)?[0-9]{1,5}[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])',
    re.IGNORECASE)
2014-04-29 21:55:35 +02:00
2014-05-26 16:50:58 +02:00
2014-04-28 22:23:05 +02:00
def findISBN(src):
    """Search for a valid ISBN in src.

    src -- path to a .pdf or .djvu file.

    Returns the ISBN with separators stripped, or False if not found,
    the extension is unsupported, or an error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    else:
        return False

    # Fix: extractISBN was previously unbound (NameError) when the loop
    # body never ran (extractor already finished before the first poll).
    extractISBN = None
    extractfull = ''
    while totext.poll() is None:
        # Fix: accumulate the extracted text instead of overwriting it
        # on every pass (consistent with findArticleID below).
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        # '&#338;' shows up in some extracted PDFs where a dash was meant.
        extractISBN = isbn_re.search(extractfull.lower()
                                     .replace('&#338;', '-'))
        if extractISBN:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # The extraction tool reported an error
        tools.warning(err)
        return False

    if extractISBN:
        # Clean ISBN is the ISBN number without separators
        return extractISBN.group(1).replace('-', '').replace(' ', '')
    return False
def isbn2Bib(isbn):
    """Try to fetch a bibtex entry for an ISBN number.

    Uses isbnlib's 'default' service (merging worldcat.org and Google
    Books results). Returns a bibtex string, or '' on failure.
    """
    try:
        formatter = isbnlib.registry.bibformatters['bibtex']
        metadata = isbnlib.meta(isbn, 'default')
        return formatter(metadata)
    except (isbnlib.ISBNLibException, TypeError):
        return ''
2014-04-28 22:23:05 +02:00
# Regexes used to spot and clean DOIs / arXiv ids in extracted text.
# Fix: all patterns are now raw strings (escape sequences such as \s and
# \d inside plain strings are deprecated), and literal dots in the DOI
# prefixes are escaped so '10.1073' no longer matches e.g. '10x1073'.
doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
doi_pnas_re = re.compile(r'(?<=doi).?10\.1073/pnas\.\d+', re.IGNORECASE)
doi_jsb_re = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)

clean_doi_re = re.compile(r'^/')
clean_doi_fabse_re = re.compile(r'^10\.1096')
clean_doi_jcb_re = re.compile(r'^10\.1083')
clean_doi_len_re = re.compile(r'\d\.\d')

arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
2014-04-29 21:55:35 +02:00
2014-11-04 21:11:19 +01:00
def findArticleID(src, only=("DOI", "arXiv")):
    """Search for a valid article ID (DOI or arXiv) in src.

    src -- path to a .pdf or .djvu file.
    only -- iterable of ID kinds to look for ("DOI" and/or "arXiv").
            Fix: the default is now a tuple instead of a mutable list;
            membership tests behave identically for callers.

    Returns a (type, first matching ID) tuple, or (False, False) if not
    found or an error occurred.

    From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return (False, False)

    extractfull = ''
    extract_type = False
    extractID = None
    while totext.poll() is None:
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        # Try to extract a DOI
        if "DOI" in only:
            # '&#338;' shows up in some extracted PDFs where a dash was meant.
            extractID = doi_re.search(extractfull.lower()
                                      .replace('&#338;', '-'))
            if not extractID:
                # PNAS fix
                extractID = doi_pnas_re.search(extractfull.
                                               lower().
                                               replace('pnas', '/pnas'))
            if not extractID:
                # JSB fix
                extractID = doi_jsb_re.search(extractfull.lower())
            if extractID:
                extract_type = "DOI"
                totext.terminate()
        # Try to extract an arXiv id
        if "arXiv" in only:
            tmp_extractID = arXiv_re.search(extractfull)
            if tmp_extractID:
                if not extractID or extractID.start(0) > tmp_extractID.start(1):
                    # Only use the arXiv id if it is before the DOI in the pdf
                    extractID = tmp_extractID
                    extract_type = "arXiv"
                    totext.terminate()
        if extract_type is not False:
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # The extraction tool reported an error
        tools.warning(err)
        return (False, False)

    if extractID is not None and extract_type == "DOI":
        # If a DOI was extracted, clean it and return it
        cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
        if clean_doi_re.search(cleanDOI):
            cleanDOI = cleanDOI[1:]
        # FABSE J fix
        if clean_doi_fabse_re.search(cleanDOI):
            cleanDOI = cleanDOI[:20]
        # Second JCB fix
        if clean_doi_jcb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:21]
        if len(cleanDOI) > 40:
            # Heuristic truncation of over-long matches: keep the prefix
            # up to the first letter that follows a digit.
            cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
            reps = {'.': 'A', '-': '0'}
            cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
            digitStart = 0
            for i in range(len(cleanDOItemp)):
                if cleanDOItemp[i].isdigit():
                    digitStart = 1
                if cleanDOItemp[i].isalpha() and digitStart:
                    break
            cleanDOI = cleanDOI[0:(8+i)]
        return ("DOI", cleanDOI)
    elif extractID is not None and extract_type == "arXiv":
        # If an arXiv id was extracted, return it
        return ("arXiv", extractID.group(1))
    return (False, False)
2014-04-28 22:23:05 +02:00
def doi2Bib(doi):
    """Return a bibTeX string of metadata for a given DOI.

    Queries dx.doi.org with content negotiation. Returns '' when the
    server does not answer with bibtex, or cannot be reached.

    From : https://gist.github.com/jrsmith3/5513926
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    req = Request(url, headers=headers)
    try:
        r = urlopen(req)
        # Header name case differs between Python 2 and 3 ('content-type'
        # vs 'Content-Type'); normalize keys once instead of probing both.
        response_headers = dict((k.lower(), v)
                                for k, v in dict(r.info()).items())
        if response_headers.get('content-type') == 'application/x-bibtex':
            return r.read().decode('utf-8')
        return ''
    except Exception:
        # Fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior
        # while letting those propagate.
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi '+doi)
        return ''
def arXiv2Bib(arxiv):
    """Return a bibTeX string of metadata for a given arXiv id.

    arxiv -- an arXiv id. Returns '' if no usable entry was fetched.
    """
    for entry in arxiv_metadata.arxiv2bib([arxiv]):
        # Skip error placeholders returned for unresolvable ids.
        if isinstance(entry, arxiv_metadata.ReferenceErrorInfo):
            continue
        parsed = bibtexparser.loads(entry.bibtex()).entries_dict
        fetched_bibtex = parsed[list(parsed.keys())[0]]
        # Remove the 'file' field if present.
        fetched_bibtex.pop('file', None)
        return tools.parsed2Bibtex(fetched_bibtex)
    return ''
# Matches a HAL identifier followed by its version, e.g.
# "hal-01234567, version 2".
HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')


def findHALId(src):
    """Search for a valid HAL id in src.

    src -- path to a .pdf or .djvu file.

    Returns a (HAL id, version) tuple, or False if not found or an
    error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return False

    # Fix: extractID was previously unbound/None when no id was found,
    # making the final return raise NameError/AttributeError instead of
    # returning False as documented.
    extractID = None
    extractfull = ''
    while totext.poll() is None:
        # Fix: accumulate the extracted text instead of overwriting it
        # on every pass (consistent with findArticleID above).
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        extractID = HAL_re.search(extractfull)
        if extractID:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # The extraction tool reported an error
        tools.warning(err)
        return False
    if extractID is None:
        return False
    return extractID.group(1), extractID.group(2)