# -*- coding: utf8 -*-
# -----------------------------------------------------------------------------
# "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):
# Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff (and you can also do whatever
# you want with this stuff without retaining it, but that's not cool...). If we
# meet some day, and you think this stuff is worth it, you can buy me a
# <del>beer</del> soda in return.
# Phyks
# -----------------------------------------------------------------------------
import isbnlib
import re
import socket
import socks
import subprocess
import sys
try:
    # For Python 3.0 and later
    from urllib.request import urlopen, Request
    from urllib.error import URLError
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen, Request, URLError
import arxiv2bib as arxiv_metadata
import libbmc.tools as tools
import bibtexparser
from libbmc.config import Config


config = Config()
default_socket = socket.socket
try:
    stdout_encoding = sys.stdout.encoding
    assert(stdout_encoding is not None)
except (AttributeError, AssertionError):
    stdout_encoding = 'UTF-8'


def download(url):
    """Download the file at the given url.

    Check that it is a valid pdf or djvu file. Tries all the
    available proxies sequentially. Returns a tuple of the raw content of the
    file and its type ('pdf' or 'djvu'), or (False, None) if it could not be
    downloaded.
    """
    for proxy in config.get("proxies"):
        if proxy.startswith('socks'):
            if proxy[5] == '4':
                proxy_type = socks.SOCKS4
            else:
                proxy_type = socks.SOCKS5
            proxy = proxy[proxy.find('://')+3:]
            try:
                proxy, port = proxy.split(':')
                # The socks module expects an integer port
                port = int(port)
            except ValueError:
                port = None
            socks.set_default_proxy(proxy_type, proxy, port)
            socket.socket = socks.socksocket
        elif proxy == '':
            socket.socket = default_socket
        else:
            try:
                proxy, port = proxy.split(':')
                # The socks module expects an integer port
                port = int(port)
            except ValueError:
                port = None
            socks.set_default_proxy(socks.HTTP, proxy, port)
            socket.socket = socks.socksocket
        try:
            r = urlopen(url)
            try:
                size = int(dict(r.info())['content-length'].strip())
            except KeyError:
                try:
                    size = int(dict(r.info())['Content-Length'].strip())
                except KeyError:
                    size = 0
            dl = b""
            dl_size = 0
            while True:
                buf = r.read(1024)
                if buf:
                    dl += buf
                    dl_size += len(buf)
                    if size != 0:
                        done = int(50 * dl_size / size)
                        sys.stdout.write("\r[%s%s]" % ('='*done, ' '*(50-done)))
                        sys.stdout.write(" "+str(int(float(done)/50*100))+"%")
                        sys.stdout.flush()
                else:
                    break
            contenttype = False
            contenttype_req = None
            try:
                contenttype_req = dict(r.info())['content-type']
            except KeyError:
                try:
                    contenttype_req = dict(r.info())['Content-Type']
                except KeyError:
                    continue
            if 'pdf' in contenttype_req:
                contenttype = 'pdf'
            elif 'djvu' in contenttype_req:
                contenttype = 'djvu'

            if r.getcode() != 200 or contenttype is False:
                continue

            return dl, contenttype
        except ValueError:
            tools.warning("Invalid URL")
            return False, None
        except (URLError, socket.error):
            if proxy != "":
                proxy_txt = "using proxy "+proxy
            else:
                proxy_txt = "without using any proxy"
            tools.warning("Unable to get "+url+" "+proxy_txt+". It " +
                          "may not be available at the moment.")
            continue
    return False, None
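# Illustrative usage of download() (a sketch, not executed on import). The URL
# and proxy values are placeholders; the code above expects the "proxies"
# config entry to be a list such as ['', 'socks5://127.0.0.1:9050']:
#
#     content, filetype = download('http://example.com/paper.pdf')
#     if content is not False:
#         with open('paper.' + filetype, 'wb') as fh:
#             fh.write(content)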


isbn_re = re.compile(r'isbn[\s]?:?[\s]?((?:[0-9]{3}[ -]?)?[0-9]{1,5}[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])',
                     re.IGNORECASE)


def findISBN(src):
    """Search for a valid ISBN in src.

    Returns the ISBN, or False if not found or an error occurred."""
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    else:
        return False

    extractISBN = None
    while totext.poll() is None:
        extractfull = ' '.join([i.decode(stdout_encoding).strip()
                                for i in totext.stdout.readlines()])
        # Replace the 'Œ' character before lowercasing so that the
        # substitution can actually match.
        extractISBN = isbn_re.search(extractfull.replace('Œ', '-').lower())
        if extractISBN:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    cleanISBN = False
    # Clean ISBN is the ISBN number without separators
    if extractISBN:
        cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
    return cleanISBN
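# Illustrative usage of findISBN() (a sketch, not executed on import). It
# assumes pdftotext/djvutxt are installed; 'book.pdf' is a placeholder:
#
#     isbn = findISBN('book.pdf')
#     if isbn:
#         print(isbn)  # the ISBN with separators stripped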


def isbn2Bib(isbn):
    """Tries to get bibtex entry from an ISBN number"""
    # Default merges results from worldcat.org and google books
    try:
        return isbnlib.registry.bibformatters['bibtex'](
            isbnlib.meta(isbn, 'default'))
    except (isbnlib.ISBNLibException, TypeError):
        return ''
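# Illustrative usage of isbn2Bib() (a sketch, not executed on import); the
# ISBN below is only a placeholder and the lookup needs network access:
#
#     bibtex = isbn2Bib('9781234567897')
#     # bibtex is a BibTeX string, or '' if the lookup failed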


doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
doi_pnas_re = re.compile(r'(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
doi_jsb_re = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
clean_doi_re = re.compile('^/')
clean_doi_fabse_re = re.compile('^10.1096')
clean_doi_jcb_re = re.compile('^10.1083')
clean_doi_len_re = re.compile(r'\d\.\d')
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
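# Illustrative behaviour of the patterns above (a sketch):
#
#     doi_re.search('doi:10.1234/example.5678')        # matches; the ':' is
#                                                       # stripped later by findArticleID
#     arXiv_re.search('arXiv:1501.00001v2').group(1)    # '1501.00001v2'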


def findArticleID(src, only=["DOI", "arXiv"]):
    """Search for a valid article ID (DOI or arXiv) in src.

    Returns a tuple (type, first matching ID), or (False, False) if not found
    or an error occurred.
    From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return (False, False)

    extractfull = ''
    extract_type = False
    extractID = None
    while totext.poll() is None:
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        # Try to extract DOI
        if "DOI" in only:
            extractlower = extractfull.lower().replace('digital object identifier', 'doi')
            # The text has been lowercased, so match the lowercase ligature
            extractID = doi_re.search(extractlower.replace('œ', '-'))
            if not extractID:
                # PNAS fix
                extractID = doi_pnas_re.search(extractlower.replace('pnas', '/pnas'))
            if not extractID:
                # JCB fix
                extractID = doi_jsb_re.search(extractlower)
            if extractID:
                extract_type = "DOI"
                totext.terminate()
        # Try to extract arXiv
        if "arXiv" in only:
            tmp_extractID = arXiv_re.search(extractfull)
            if tmp_extractID:
                if not extractID or extractID.start(0) > tmp_extractID.start(1):
                    # Only use the arXiv id if it appears before the DOI in the pdf
                    extractID = tmp_extractID
                    extract_type = "arXiv"
                    totext.terminate()
        if extract_type is not False:
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return (False, False)

    if extractID is not None and extract_type == "DOI":
        # If a DOI was extracted, clean it and return it
        cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
        if clean_doi_re.search(cleanDOI):
            cleanDOI = cleanDOI[1:]
        # FASEB J fix
        if clean_doi_fabse_re.search(cleanDOI):
            cleanDOI = cleanDOI[:20]
        # Second JCB fix
        if clean_doi_jcb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:21]
        if len(cleanDOI) > 40:
            cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
            reps = {'.': 'A', '-': '0'}
            cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
            digitStart = 0
            for i in range(len(cleanDOItemp)):
                if cleanDOItemp[i].isdigit():
                    digitStart = 1
                if cleanDOItemp[i].isalpha() and digitStart:
                    break
            cleanDOI = cleanDOI[0:(8+i)]
        return ("DOI", cleanDOI)
    elif extractID is not None and extract_type == "arXiv":
        # If an arXiv id was extracted, return it
        return ("arXiv", extractID.group(1))
    return (False, False)
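# Illustrative usage of findArticleID() (a sketch, not executed on import);
# 'paper.pdf' is a placeholder file name:
#
#     id_type, article_id = findArticleID('paper.pdf')
#     if id_type == "DOI":
#         bibtex = doi2Bib(article_id)
#     elif id_type == "arXiv":
#         bibtex = arXiv2Bib(article_id)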


def doi2Bib(doi):
    """Returns a bibTeX string of metadata for a given DOI.

    From : https://gist.github.com/jrsmith3/5513926
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    req = Request(url, headers=headers)
    try:
        r = urlopen(req)

        try:
            if dict(r.info())['content-type'] == 'application/x-bibtex':
                return r.read().decode('utf-8')
            else:
                return ''
        except KeyError:
            try:
                if dict(r.info())['Content-Type'] == 'application/x-bibtex':
                    return r.read().decode('utf-8')
                else:
                    return ''
            except KeyError:
                return ''
    except:
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi '+doi)
        return ''
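# Illustrative usage of doi2Bib() (a sketch, not executed on import); the DOI
# below is a placeholder and the request needs network access:
#
#     bibtex = doi2Bib('10.1000/xyz123')
#     # bibtex is a BibTeX string, or '' on failure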


def arXiv2Bib(arxiv):
    """Returns bibTeX string of metadata for a given arXiv id

    arxiv is an arxiv id
    """
    bibtex = arxiv_metadata.arxiv2bib([arxiv])
    for bib in bibtex:
        if isinstance(bib, arxiv_metadata.ReferenceErrorInfo):
            continue
        else:
            fetched_bibtex = bibtexparser.loads(bib.bibtex())
            fetched_bibtex = fetched_bibtex.entries_dict
            fetched_bibtex = fetched_bibtex[list(fetched_bibtex.keys())[0]]
            try:
                del fetched_bibtex['file']
            except KeyError:
                pass
            return tools.parsed2Bibtex(fetched_bibtex)
    return ''
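# Illustrative usage of arXiv2Bib() (a sketch, not executed on import); the
# arXiv id below is a placeholder:
#
#     bibtex = arXiv2Bib('1501.00001')
#     # bibtex is a BibTeX string, or '' if the id could not be resolved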


HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')


def findHALId(src):
    """Searches for a valid HAL id in src.

    Returns a tuple of the HAL id and the version,
    or False if not found or an error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return False

    extractID = None
    while totext.poll() is None:
        extractfull = ' '.join([i.decode(stdout_encoding).strip()
                                for i in totext.stdout.readlines()])
        extractID = HAL_re.search(extractfull)
        if extractID:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False
    if extractID:
        return extractID.group(1), extractID.group(2)
    # No HAL id found
    return False
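# Illustrative usage of findHALId() (a sketch, not executed on import);
# 'preprint.pdf' is a placeholder file name:
#
#     result = findHALId('preprint.pdf')
#     if result is not False:
#         hal_id, version = result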