2014-04-28 22:23:05 +02:00
|
|
|
#!/usr/bin/env python2
|
2013-05-11 16:10:48 +02:00
|
|
|
# coding=utf8
|
2014-04-26 11:52:19 +02:00
|
|
|
|
2014-04-28 22:23:05 +02:00
|
|
|
import isbntools
|
|
|
|
import re
|
|
|
|
import requesocks as requests # Requesocks is requests with SOCKS support
|
|
|
|
import subprocess
|
2014-05-14 23:07:06 +02:00
|
|
|
import sys
|
2014-05-02 00:07:49 +02:00
|
|
|
import arxiv2bib as arxiv_metadata
|
2014-04-28 22:23:05 +02:00
|
|
|
import tools
|
2014-04-26 11:52:19 +02:00
|
|
|
import params
|
2014-05-11 19:29:42 +02:00
|
|
|
from bibtexparser.bparser import BibTexParser
|
|
|
|
from isbntools.dev.fmt import fmtbib
|
2013-01-22 02:11:12 +01:00
|
|
|
|
2014-04-26 18:43:25 +02:00
|
|
|
|
2014-04-28 22:23:05 +02:00
|
|
|
def download(url):
    """Download the file at url, trying every available proxy in order.

    Checks that the response is a valid pdf or djvu file (based on the
    Content-Type header). Returns a tuple (content, 'pdf' or 'djvu') with
    the raw bytes of the file, or False if it could not be downloaded
    through any proxy.
    """
    for proxy in params.proxies:
        r_proxy = {
            "http": proxy,
            "https": proxy,
        }
        try:
            r = requests.get(url, proxies=r_proxy)
            # Server may omit Content-Length; fall back to 0 instead of
            # raising KeyError.
            size = int(r.headers.get('Content-Length', '0').strip() or 0)
            dl = ""
            dl_size = 0
            for buf in r.iter_content(1024):
                if buf:
                    dl += buf
                    dl_size += len(buf)
                    if size > 0:
                        # Draw a 50-character progress bar on stdout.
                        done = int(50 * dl_size / size)
                        sys.stdout.write("\r[%s%s]"%('='*done,' '*(50-done)))
                        # BUGFIX: the percentage was computed as done/52,
                        # which capped the display at ~96%.
                        sys.stdout.write(" "+str(int(float(done)/50*100))+"%")
                        sys.stdout.flush()
            contenttype = False
            if 'pdf' in r.headers['content-type']:
                contenttype = 'pdf'
            elif 'djvu' in r.headers['content-type']:
                contenttype = 'djvu'

            if r.status_code != 200 or contenttype is False:
                # Not found, or not a supported file type: try next proxy.
                continue

            return dl, contenttype
        except requests.exceptions.RequestException:
            tools.warning("Unable to get "+url+" using proxy "+proxy+". It " +
                          "may not be available.")
            continue
    # Every proxy failed.
    return False
|
2014-04-28 22:23:05 +02:00
|
|
|
|
|
|
|
|
2014-05-11 19:29:42 +02:00
|
|
|
# Matches an ISBN-10 or ISBN-13 written with space or dash separators and
# preceded by the word "isbn" (case-insensitive). Group 1 is the number
# itself, separators included.
isbn_re = re.compile(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
                     re.IGNORECASE)
|
2014-04-29 21:55:35 +02:00
|
|
|
|
|
|
|
|
2014-04-28 22:23:05 +02:00
|
|
|
def findISBN(src):
    """Search for a valid ISBN in src.

    src must be a .pdf or .djvu file; its text is extracted with
    pdftotext / djvutxt and scanned line by line.
    Returns the ISBN (without separators) or False if not found or an
    error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    else:
        # Unsupported file type
        return False

    # BUGFIX: initialize so the checks below cannot raise NameError when
    # the extractor exits before the first line is read.
    extractISBN = None
    while totext.poll() is None:
        extractfull = totext.stdout.readline()
        # 'Œ' sometimes appears in extracted text where a dash was
        # expected (ligature mangling).
        extractISBN = isbn_re.search(extractfull.lower().replace('Œ',
                                                                 '-'))
        if extractISBN:
            # Found one: no need to extract the rest of the file.
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    cleanISBN = False
    # Clean ISBN is the ISBN number without separators
    if extractISBN:
        cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
    return cleanISBN
|
|
|
|
|
|
|
|
|
|
|
|
def isbn2Bib(isbn):
    """Tries to get bibtex entry from an ISBN number"""
    # The 'default' service merges results from worldcat.org and Google
    # Books.
    metadata = isbntools.meta(isbn, 'default')
    return fmtbib('bibtex', metadata)
|
2014-04-28 22:23:05 +02:00
|
|
|
|
|
|
|
|
2014-05-11 19:29:42 +02:00
|
|
|
# Generic DOI: the literal "doi" (optionally followed by '/', ':' and a
# space) then a 7-character registrant prefix, a slash and a suffix ending
# in a digit.
doi_re = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
# PNAS papers print their DOI without the slash before "pnas".
doi_pnas_re = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
# Journal of Cell Biology DOIs appear without a "doi" marker in the text.
doi_jsb_re = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)
# Cleanup helpers applied to a raw DOI match:
clean_doi_re = re.compile('^/')              # leading slash to strip
clean_doi_fabse_re = re.compile('^10.1096')  # FASEB J prefix (truncate)
clean_doi_jcb_re = re.compile('^10.1083')    # JCB prefix (truncate)
clean_doi_len_re = re.compile(r'\d\.\d')     # digit.digit runs, for length fix
|
|
|
|
|
|
|
|
|
2014-04-28 22:23:05 +02:00
|
|
|
def findDOI(src):
    """Search for a valid DOI in src.

    src must be a .pdf or .djvu file; its text is extracted with
    pdftotext / djvutxt and scanned line by line.
    Returns the DOI or False if not found or an error occurred.
    From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        # Unsupported file type
        return False

    # BUGFIX: initialize so the code below cannot raise NameError when the
    # extractor exits before the first line is read.
    extractDOI = None
    while totext.poll() is None:
        extractfull = totext.stdout.readline()
        # 'Œ' sometimes appears in extracted text where a dash was
        # expected (ligature mangling).
        extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-'))
        if not extractDOI:
            # PNAS fix
            extractDOI = doi_pnas_re.search(extractfull.lower().replace('pnas',
                                                                        '/pnas'))
            if not extractDOI:
                # JSB fix
                extractDOI = doi_jsb_re.search(extractfull.lower())
        if extractDOI:
            # Found one: no need to extract the rest of the file.
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    cleanDOI = False
    if extractDOI:
        cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
        if clean_doi_re.search(cleanDOI):
            # Strip the leading slash left over from the "doi/" tag.
            cleanDOI = cleanDOI[1:]
        # FABSE J fix
        if clean_doi_fabse_re.search(cleanDOI):
            cleanDOI = cleanDOI[:20]
        # Second JCB fix
        if clean_doi_jcb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:21]
        if len(cleanDOI) > 40:
            # Heuristic truncation: the match probably glued trailing text
            # onto the DOI. Normalize the suffix, then cut at the first
            # alphabetic character that follows a digit.
            cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
            reps = {'.': 'A', '-': '0'}
            cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
            digitStart = 0
            for i in range(len(cleanDOItemp)):
                if cleanDOItemp[i].isdigit():
                    digitStart = 1
                if cleanDOItemp[i].isalpha() and digitStart:
                    break
            cleanDOI = cleanDOI[0:(8+i)]
    return cleanDOI
|
|
|
|
|
|
|
|
|
|
|
|
def doi2Bib(doi):
    """Returns a bibTeX string of metadata for a given DOI.

    From : https://gist.github.com/jrsmith3/5513926
    """
    # dx.doi.org performs content negotiation: asking for x-bibtex gets
    # the bibtex entry directly.
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    try:
        r = requests.get(url, headers=headers)
        if r.headers['content-type'] != 'application/x-bibtex':
            return ''
        return r.text
    except requests.exceptions.ConnectionError:
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi '+doi)
        return ''
|
2014-05-01 19:46:04 +02:00
|
|
|
|
|
|
|
|
2014-05-11 19:29:42 +02:00
|
|
|
# arXiv identifier following an "arXiv:" tag, e.g. "arXiv:1303.1234v1" or
# old-style "arXiv:math.GT/0309136". Group 1 is the id.
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
|
2014-05-01 19:46:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
def findArXivId(src):
    """Searches for a valid arXiv id in src.

    src must be a .pdf or .djvu file; its text is extracted with
    pdftotext / djvutxt and scanned line by line.
    Returns the arXiv id or False if not found or an error occurred.
    From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        # Unsupported file type
        return False

    # BUGFIX: initialize so the checks below cannot raise NameError when
    # the extractor exits before the first line is read.
    extractID = None
    while totext.poll() is None:
        extractfull = totext.stdout.readline()
        extractID = arXiv_re.search(extractfull)
        if extractID:
            # Found one: no need to extract the rest of the file.
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False
    elif extractID is not None:
        return extractID.group(1)
    else:
        return False
|
2014-05-01 19:46:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
def arXiv2Bib(arxiv):
    """Returns bibTeX string of metadata for a given arXiv id

    arxiv is an arxiv id
    Returns False if no entry could be fetched.
    """
    bibtex = arxiv_metadata.arxiv2bib([arxiv])
    for bib in bibtex:
        if isinstance(bib, arxiv_metadata.ReferenceErrorInfo):
            # Fetch failed for this id; fall through to return False.
            continue
        else:
            fetched_bibtex = BibTexParser(bib.bibtex())
            fetched_bibtex = fetched_bibtex.get_entry_dict()
            # Only one id was requested, so take the single entry.
            fetched_bibtex = fetched_bibtex[list(fetched_bibtex.keys())[0]]
            # Drop any 'file' field coming from the parser.
            # BUGFIX: was "try: del(...) except: pass" — a bare except that
            # could hide unrelated errors.
            fetched_bibtex.pop('file', None)
            return tools.parsed2Bibtex(fetched_bibtex)
    return False
|
2014-05-02 00:33:09 +02:00
|
|
|
|
|
|
|
|
|
|
|
# HAL id ("hal-" + 8 digits) followed by its version, e.g.
# "hal-01234567, version 2". Group 1 is the id, group 2 the version.
HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')
|
|
|
|
|
|
|
|
|
|
|
|
def findHALId(src):
    """Searches for a valid HAL id in src

    src must be a .pdf or .djvu file; its text is extracted with
    pdftotext / djvutxt and scanned line by line.
    Returns a tuple of the HAL id and the version
    or False if not found or an error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        # Unsupported file type
        return False

    # BUGFIX: initialize so the checks below cannot raise NameError when
    # the extractor exits before the first line is read.
    extractID = None
    while totext.poll() is None:
        extractfull = totext.stdout.readline()
        extractID = HAL_re.search(extractfull)
        if extractID:
            # Found one: no need to extract the rest of the file.
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False
    elif extractID is not None:
        return extractID.group(1), extractID.group(2)
    else:
        # BUGFIX: previously fell into the else branch with extractID set
        # to None and crashed with AttributeError on .group(1); return
        # False as documented instead.
        return False
|