# -*- coding: utf8 -*-
# -----------------------------------------------------------------------------
# "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):
# Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff (and you can also do whatever
# you want with this stuff without retaining it, but that's not cool...). If we
# meet some day, and you think this stuff is worth it, you can buy me a
# <del>beer</del> soda in return.
# Phyks
# -----------------------------------------------------------------------------
import isbnlib
import re
import socket
import socks
import subprocess
import sys
try:
    # For Python 3.0 and later
    from urllib.request import urlopen, Request
    from urllib.error import URLError
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen, Request, URLError
import arxiv2bib as arxiv_metadata
import libbmc.tools as tools
from bibtexparser.bparser import BibTexParser
from libbmc.config import Config

config = Config()

default_socket = socket.socket

try:
    stdout_encoding = sys.stdout.encoding
    assert stdout_encoding is not None
except (AttributeError, AssertionError):
    stdout_encoding = 'UTF-8'


def download(url):
    """Download url to a file.

    Check that it is a valid pdf or djvu file. Tries all the available
    proxies sequentially. Returns a tuple of the raw content of the file
    and its type ('pdf' or 'djvu'), or (False, None) if it could not be
    downloaded.
    """
    for proxy in config.get("proxies"):
        if proxy.startswith('socks'):
            if proxy[5] == '4':
                proxy_type = socks.SOCKS4
            else:
                proxy_type = socks.SOCKS5
            proxy = proxy[proxy.find('://')+3:]
            try:
                proxy, port = proxy.split(':')
                port = int(port)
            except ValueError:
                port = None
            socks.set_default_proxy(proxy_type, proxy, port)
            socket.socket = socks.socksocket
        elif proxy == '':
            # An empty proxy entry means a direct connection
            socket.socket = default_socket
        else:
            try:
                proxy, port = proxy.split(':')
                port = int(port)
            except ValueError:
                port = None
            socks.set_default_proxy(socks.HTTP, proxy, port)
            socket.socket = socks.socksocket
        try:
            r = urlopen(url)
            try:
                size = int(dict(r.info())['content-length'].strip())
            except KeyError:
                try:
                    size = int(dict(r.info())['Content-Length'].strip())
                except KeyError:
                    size = 0
            dl = b""
            dl_size = 0
            while True:
                buf = r.read(1024)
                if not buf:
                    break
                dl += buf
                dl_size += len(buf)
                if size != 0:
                    # Draw a 50-column progress bar
                    done = int(50 * dl_size / size)
                    sys.stdout.write("\r[%s%s]" % ('='*done, ' '*(50-done)))
                    sys.stdout.write(" "+str(int(float(done)/50*100))+"%")
                    sys.stdout.flush()
            contenttype = False
            try:
                contenttype_req = dict(r.info())['content-type']
            except KeyError:
                try:
                    contenttype_req = dict(r.info())['Content-Type']
                except KeyError:
                    continue
            if 'pdf' in contenttype_req:
                contenttype = 'pdf'
            elif 'djvu' in contenttype_req:
                contenttype = 'djvu'
            if r.getcode() != 200 or contenttype is False:
                continue
            return dl, contenttype
        except ValueError:
            tools.warning("Invalid URL")
            return False, None
        except URLError:
            tools.warning("Unable to get "+url+" using proxy "+proxy +
                          ". It may not be available.")
            continue
    return False, None
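
# Illustrative usage of download() (not part of the original module; the URL
# and output filename below are placeholders, and "proxies" must be set in
# the config, with an empty string entry meaning a direct connection):
#
#   content, filetype = download('http://example.com/paper.pdf')
#   if content is not False:
#       with open('paper.' + filetype, 'wb') as f:
#           f.write(content)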


isbn_re = re.compile(
    r'isbn[\s]?:?[\s]?((?:[0-9]{3}[ -]?)?[0-9]{1,5}[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])',
    re.IGNORECASE)


def findISBN(src):
    """Search for a valid ISBN in src.

    Returns the ISBN, or False if none was found or an error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    else:
        return False

    extractISBN = None
    while totext.poll() is None:
        extractfull = ' '.join([i.decode(stdout_encoding).strip()
                                for i in totext.stdout.readlines()])
        extractISBN = isbn_re.search(extractfull.lower()
                                     .replace('&#338;', '-'))
        if extractISBN:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # The text extraction tool failed
        tools.warning(err)
        return False

    # cleanISBN is the ISBN without separators
    cleanISBN = False
    if extractISBN:
        cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
    return cleanISBN
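
# Illustrative usage of findISBN() (hypothetical file path):
#
#   isbn = findISBN('/tmp/book.pdf')
#   if isbn:
#       print('Found ISBN: ' + isbn)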


def isbn2Bib(isbn):
    """Try to get a BibTeX entry from an ISBN number."""
    # The 'default' service merges results from worldcat.org and Google Books
    try:
        return isbnlib.registry.bibformatters['bibtex'](
            isbnlib.meta(isbn, 'default'))
    except (isbnlib.ISBNLibException, isbnlib.ISBNToolsException, TypeError):
        return ''
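
# Illustrative usage of isbn2Bib() (the ISBN below is only an example value;
# an empty string is returned on failure):
#
#   bibtex = isbn2Bib('9780262033848')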


doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
doi_pnas_re = re.compile(r'(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
doi_jcb_re = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
clean_doi_re = re.compile(r'^/')
clean_doi_faseb_re = re.compile(r'^10.1096')
clean_doi_jcb_re = re.compile(r'^10.1083')
clean_doi_len_re = re.compile(r'\d\.\d')
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)


def findArticleID(src, only=["DOI", "arXiv"]):
    """Search for a valid article ID (DOI or arXiv id) in src.

    Returns a tuple (type, first matching ID), or (False, False) if none
    was found or an error occurred.

    From: http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return (False, False)

    extractfull = ''
    extract_type = False
    extractID = None
    while totext.poll() is None:
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        # Try to extract a DOI
        if "DOI" in only:
            extractID = doi_re.search(extractfull.lower()
                                      .replace('&#338;', '-'))
            if not extractID:
                # PNAS fix
                extractID = doi_pnas_re.search(extractfull.lower()
                                               .replace('pnas', '/pnas'))
                if not extractID:
                    # JCB fix
                    extractID = doi_jcb_re.search(extractfull.lower())
            if extractID:
                extract_type = "DOI"
                totext.terminate()
        # Try to extract an arXiv id
        if "arXiv" in only:
            tmp_extractID = arXiv_re.search(extractfull)
            if tmp_extractID:
                if (not extractID or
                        extractID.start(0) > tmp_extractID.start(1)):
                    # Only use the arXiv id if it appears before the DOI
                    extractID = tmp_extractID
                    extract_type = "arXiv"
                    totext.terminate()
        if extract_type is not False:
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # The text extraction tool failed
        tools.warning(err)
        return (False, False)

    if extractID is not None and extract_type == "DOI":
        # A DOI was extracted: clean it up and return it
        cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
        if clean_doi_re.search(cleanDOI):
            cleanDOI = cleanDOI[1:]
        # FASEB J fix
        if clean_doi_faseb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:20]
        # Second JCB fix
        if clean_doi_jcb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:21]
        if len(cleanDOI) > 40:
            # Heuristic to trim trailing garbage picked up after the DOI
            cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
            reps = {'.': 'A', '-': '0'}
            cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
            digitStart = 0
            for i in range(len(cleanDOItemp)):
                if cleanDOItemp[i].isdigit():
                    digitStart = 1
                if cleanDOItemp[i].isalpha() and digitStart:
                    break
            cleanDOI = cleanDOI[0:(8+i)]
        return ("DOI", cleanDOI)
    elif extractID is not None and extract_type == "arXiv":
        # An arXiv id was extracted: return it
        return ("arXiv", extractID.group(1))
    return (False, False)
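
# Illustrative usage of findArticleID() (hypothetical file path):
#
#   id_type, article_id = findArticleID('/tmp/paper.pdf')
#   if id_type == 'DOI':
#       print('Found DOI: ' + article_id)
#   elif id_type == 'arXiv':
#       print('Found arXiv id: ' + article_id)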


def doi2Bib(doi):
    """Return a BibTeX string of metadata for a given DOI.

    From: https://gist.github.com/jrsmith3/5513926
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    req = Request(url, headers=headers)
    try:
        r = urlopen(req)
        try:
            if dict(r.info())['content-type'] == 'application/x-bibtex':
                return r.read().decode('utf-8')
            else:
                return ''
        except KeyError:
            try:
                if dict(r.info())['Content-Type'] == 'application/x-bibtex':
                    return r.read().decode('utf-8')
                else:
                    return ''
            except KeyError:
                return ''
    except URLError:
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi '+doi)
        return ''
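
# Illustrative usage of doi2Bib() (arbitrary example DOI; returns '' if the
# server does not answer with a BibTeX entry):
#
#   bibtex = doi2Bib('10.1000/182')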


def arXiv2Bib(arxiv):
    """Return a BibTeX string of metadata for a given arXiv id.

    arxiv is an arXiv id.
    """
    bibtex = arxiv_metadata.arxiv2bib([arxiv])
    for bib in bibtex:
        if isinstance(bib, arxiv_metadata.ReferenceErrorInfo):
            continue
        else:
            fetched_bibtex = BibTexParser(bib.bibtex())
            fetched_bibtex = fetched_bibtex.get_entry_dict()
            fetched_bibtex = fetched_bibtex[list(fetched_bibtex.keys())[0]]
            try:
                del fetched_bibtex['file']
            except KeyError:
                pass
            return tools.parsed2Bibtex(fetched_bibtex)
    return ''
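
# Illustrative usage of arXiv2Bib() (arbitrary example id; returns '' when no
# metadata could be fetched):
#
#   bibtex = arXiv2Bib('1312.4006')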


HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')


def findHALId(src):
    """Search for a valid HAL id in src.

    Returns a tuple (HAL id, version), or False if none was found or an
    error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return False

    extractID = None
    while totext.poll() is None:
        extractfull = ' '.join([i.decode(stdout_encoding).strip()
                                for i in totext.stdout.readlines()])
        extractID = HAL_re.search(extractfull)
        if extractID:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # The text extraction tool failed
        tools.warning(err)
        return False
    elif extractID:
        return extractID.group(1), extractID.group(2)
    else:
        return False
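
# Illustrative usage of findHALId() (hypothetical file path):
#
#   result = findHALId('/tmp/preprint.pdf')
#   if result is not False:
#       hal_id, version = result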