330c2f2b5f
If the paper's own identifier is labelled "Digital Object Identifier" (spelled out) while one or more of its references contains a literal "doi:" link, the first reference's DOI was extracted instead of the paper's own. This change replaces the words "Digital Object Identifier" with "doi" in the text being searched, so that the correct ID is pulled out.
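A minimal sketch of the normalization step described above (the text and both DOIs are made up for illustration):

    text = "Digital Object Identifier: 10.1000/xyz123 ... [1] doi:10.2000/ref456"
    normalized = text.lower().replace('digital object identifier', 'doi')
    # doi_re now matches the paper's own DOI (10.1000/xyz123) first,
    # instead of the reference's doi: link further down the text.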
360 lines
12 KiB
Python
# -*- coding: utf8 -*-
# -----------------------------------------------------------------------------
# "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):
# Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff (and you can also do whatever
# you want with this stuff without retaining it, but that's not cool...). If we
# meet some day, and you think this stuff is worth it, you can buy me a
# <del>beer</del> soda in return.
# Phyks
# -----------------------------------------------------------------------------

import isbnlib
import re
import socket
import socks
import subprocess
import sys
try:
    # For Python 3.0 and later
    from urllib.request import urlopen, Request
    from urllib.error import URLError
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen, Request, URLError

import arxiv2bib as arxiv_metadata
import libbmc.tools as tools
import bibtexparser
from libbmc.config import Config

config = Config()
default_socket = socket.socket
try:
    stdout_encoding = sys.stdout.encoding
    assert stdout_encoding is not None
except (AttributeError, AssertionError):
    stdout_encoding = 'UTF-8'

def download(url):
    """Download the file at url and return its content.

    Checks that it is a valid PDF or djvu file (by Content-Type). Tries all
    the available proxies sequentially. Returns a tuple (content, filetype)
    with the raw content of the file, or (False, None) if it could not be
    downloaded.
    """
    for proxy in config.get("proxies"):
        if proxy.startswith('socks'):
            if proxy[5] == '4':
                proxy_type = socks.SOCKS4
            else:
                proxy_type = socks.SOCKS5
            proxy = proxy[proxy.find('://')+3:]
            try:
                proxy, port = proxy.split(':')
                port = int(port)  # socks expects a numeric port
            except ValueError:
                port = None
            socks.set_default_proxy(proxy_type, proxy, port)
            socket.socket = socks.socksocket
        elif proxy == '':
            # An empty proxy entry means a direct connection
            socket.socket = default_socket
        else:
            try:
                proxy, port = proxy.split(':')
                port = int(port)  # socks expects a numeric port
            except ValueError:
                port = None
            socks.set_default_proxy(socks.HTTP, proxy, port)
            socket.socket = socks.socksocket
        try:
            r = urlopen(url)
            # Header name case differs between Python 2 and Python 3
            info = dict(r.info())
            try:
                size = int(info.get('content-length',
                                    info.get('Content-Length')).strip())
            except (AttributeError, ValueError):
                size = 0
            dl = b""
            dl_size = 0
            while True:
                buf = r.read(1024)
                if buf:
                    dl += buf
                    dl_size += len(buf)
                    if size != 0:
                        # Progress bar out of 50 characters, so done * 2
                        # is the percentage
                        done = int(50 * dl_size / size)
                        sys.stdout.write("\r[%s%s]" % ('='*done,
                                                       ' '*(50-done)))
                        sys.stdout.write(" "+str(2*done)+"%")
                        sys.stdout.flush()
                else:
                    break
            contenttype = False
            contenttype_req = info.get('content-type',
                                       info.get('Content-Type'))
            if contenttype_req is None:
                continue
            if 'pdf' in contenttype_req:
                contenttype = 'pdf'
            elif 'djvu' in contenttype_req:
                contenttype = 'djvu'

            if r.getcode() != 200 or contenttype is False:
                continue

            return dl, contenttype
        except ValueError:
            tools.warning("Invalid URL")
            return False, None
        except (URLError, socket.error):
            if proxy != "":
                proxy_txt = "using proxy "+proxy
            else:
                proxy_txt = "without using any proxy"
            tools.warning("Unable to get "+url+" "+proxy_txt+". It " +
                          "may not be available at the moment.")
            continue
    return False, None

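# Usage sketch for download() (the URL is hypothetical); it returns the raw
# bytes and the detected filetype, or (False, None) on failure:
#   content, filetype = download("http://example.com/paper.pdf")
#   if content is not False:
#       with open("paper." + filetype, "wb") as fh:
#           fh.write(content)
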
isbn_re = re.compile(r'isbn[\s]?:?[\s]?((?:[0-9]{3}[ -]?)?[0-9]{1,5}[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])',
                     re.IGNORECASE)

def findISBN(src):
    """Search for a valid ISBN in src.

    Returns the ISBN, or False if not found or an error occurred."""
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    else:
        return False

    extractfull = ''
    extractISBN = None
    while totext.poll() is None:
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        # Replace the 'Œ' ligature before lowercasing: 'Œ'.lower() is 'œ',
        # so replacing it on the already lowercased text would never match.
        extractISBN = isbn_re.search(extractfull.replace('Œ', '-').lower())
        if extractISBN:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    cleanISBN = False
    # Clean ISBN is the ISBN number without separators
    if extractISBN:
        cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
    return cleanISBN

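# Usage sketch (hypothetical file): findISBN returns the ISBN found in the
# extracted text, stripped of separators, or False:
#   isbn = findISBN("some_book.pdf")
#   if isbn:
#       print(isbn)
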
def isbn2Bib(isbn):
    """Try to get a bibtex entry from an ISBN number."""
    # Default merges results from worldcat.org and Google Books
    try:
        return isbnlib.registry.bibformatters['bibtex'](isbnlib.meta(isbn,
                                                                     'default'))
    except (isbnlib.ISBNLibException, TypeError):
        return ''

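# Usage sketch, chaining with findISBN (the ISBN below is just an example):
#   print(isbn2Bib('9780262033848'))  # prints a @book{...} entry, or ''
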
doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
doi_pnas_re = re.compile(r'(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
# 10.1083 is the Journal of Cell Biology (JCB) prefix,
# 10.1096 the FASEB Journal one
doi_jcb_re = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
clean_doi_re = re.compile(r'^/')
clean_doi_faseb_re = re.compile(r'^10.1096')
clean_doi_jcb_re = re.compile(r'^10.1083')
clean_doi_len_re = re.compile(r'\d\.\d')
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)

def findArticleID(src, only=["DOI", "arXiv"]):
    """Search for a valid article ID (DOI or arXiv) in src.

    Returns a tuple (type, first matching ID), or (False, False) if not
    found or an error occurred.
    From: http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return (False, False)

    extractfull = ''
    extract_type = False
    extractID = None
    while totext.poll() is None:
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        # Try to extract DOI
        if "DOI" in only:
            # Replace the 'Œ' ligature before lowercasing ('Œ'.lower() is
            # 'œ'), and normalize the spelled-out label to "doi" so that the
            # paper's own DOI is found, not the first reference's
            extractlower = (extractfull.replace('Œ', '-').lower()
                            .replace('digital object identifier', 'doi'))
            extractID = doi_re.search(extractlower)
            if not extractID:
                # PNAS fix
                extractID = doi_pnas_re.search(
                    extractlower.replace('pnas', '/pnas'))
                if not extractID:
                    # JCB fix
                    extractID = doi_jcb_re.search(extractlower)
            if extractID:
                extract_type = "DOI"
                totext.terminate()
        # Try to extract arXiv
        if "arXiv" in only:
            tmp_extractID = arXiv_re.search(extractfull)
            if tmp_extractID:
                if not extractID or extractID.start(0) > tmp_extractID.start(1):
                    # Only use arXiv id if it is before the DOI in the pdf
                    extractID = tmp_extractID
                    extract_type = "arXiv"
                    totext.terminate()
        if extract_type is not False:
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return (False, False)

    if extractID is not None and extract_type == "DOI":
        # If DOI extracted, clean it and return it
        cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
        if clean_doi_re.search(cleanDOI):
            cleanDOI = cleanDOI[1:]
        # FASEB J fix
        if clean_doi_faseb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:20]
        # Second JCB fix
        if clean_doi_jcb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:21]
        if len(cleanDOI) > 40:
            # Heuristic truncation of DOIs that grabbed trailing text:
            # mask digits and separators, then cut where a word starts
            # again after the numeric part
            cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
            reps = {'.': 'A', '-': '0'}
            cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
            digitStart = 0
            for i in range(len(cleanDOItemp)):
                if cleanDOItemp[i].isdigit():
                    digitStart = 1
                if cleanDOItemp[i].isalpha() and digitStart:
                    break
            cleanDOI = cleanDOI[0:(8+i)]
        return ("DOI", cleanDOI)
    elif extractID is not None and extract_type == "arXiv":
        # If arXiv id is extracted, return it
        return ("arXiv", extractID.group(1))
    return (False, False)

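# Usage sketch (hypothetical file): findArticleID returns ("DOI", id),
# ("arXiv", id), or (False, False). The result feeds doi2Bib / arXiv2Bib
# below:
#   id_type, article_id = findArticleID("paper.pdf")
#   if id_type == "DOI":
#       bibtex = doi2Bib(article_id)
#   elif id_type == "arXiv":
#       bibtex = arXiv2Bib(article_id)
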
def doi2Bib(doi):
    """Return a bibTeX string of metadata for a given DOI.

    From: https://gist.github.com/jrsmith3/5513926
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    req = Request(url, headers=headers)
    try:
        r = urlopen(req)

        # Header name case differs between Python 2 and Python 3
        info = dict(r.info())
        contenttype = info.get('content-type', info.get('Content-Type'))
        if contenttype == 'application/x-bibtex':
            return r.read().decode('utf-8')
        else:
            return ''
    except (URLError, socket.error):
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi '+doi)
        return ''

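# Usage sketch (the DOI is just an example); dx.doi.org content negotiation
# returns a bibtex entry, or '' on failure:
#   print(doi2Bib('10.1103/PhysRevLett.116.061102'))
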
def arXiv2Bib(arxiv):
    """Return a bibTeX string of metadata for a given arXiv id.

    arxiv is an arXiv id.
    """
    bibtex = arxiv_metadata.arxiv2bib([arxiv])
    for bib in bibtex:
        if isinstance(bib, arxiv_metadata.ReferenceErrorInfo):
            continue
        fetched_bibtex = bibtexparser.loads(bib.bibtex())
        fetched_bibtex = fetched_bibtex.entries_dict
        fetched_bibtex = fetched_bibtex[list(fetched_bibtex.keys())[0]]
        try:
            del fetched_bibtex['file']
        except KeyError:
            pass
        return tools.parsed2Bibtex(fetched_bibtex)
    return ''

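# Usage sketch (the id is just an example); returns a bibtex entry, or ''
# when the id cannot be resolved:
#   print(arXiv2Bib('1602.03837'))
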
HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')

def findHALId(src):
    """Search for a valid HAL id in src.

    Returns a tuple of the HAL id and the version,
    or False if not found or an error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return False

    extractfull = ''
    extractID = None
    while totext.poll() is None:
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        extractID = HAL_re.search(extractfull)
        if extractID:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False
    elif extractID:
        return extractID.group(1), extractID.group(2)
    return False
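
# Usage sketch (hypothetical file): findHALId returns a (HAL id, version)
# tuple, e.g. ('hal-00000001', '2'), or False:
#   result = findHALId("preprint.pdf")
#   if result:
#       hal_id, version = result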