# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------------
# "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):
# Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff (and you can also do whatever
# you want with this stuff without retaining it, but that's not cool...). If we
# meet some day, and you think this stuff is worth it, you can buy me a
# <del>beer</del> soda in return.
#                                                                   Phyks
# -----------------------------------------------------------------------------
import isbnlib
import re
import socket
import socks
import subprocess
import sys

try:
    # For Python 3.0 and later
    from urllib.request import urlopen, Request
    from urllib.error import URLError
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen, Request, URLError

import arxiv2bib as arxiv_metadata
import libbmc.tools as tools
import bibtexparser
from libbmc.config import Config


config = Config()
default_socket = socket.socket
try:
    stdout_encoding = sys.stdout.encoding
    assert(stdout_encoding is not None)
except (AttributeError, AssertionError):
    stdout_encoding = 'UTF-8'


def _get_header(response, name):
    """Case-insensitive lookup of an HTTP header on a urlopen response.

    Python 2 and Python 3 do not expose header names with the same casing,
    so try both the lowercase and the Title-Case variants.
    Returns None if the header is absent.
    """
    headers = dict(response.info())
    return headers.get(name.lower(), headers.get(name.title()))


def download(url):
    """Download the file at url.

    Checks that it is a valid pdf or djvu file, and tries all the available
    proxies sequentially.

    Returns a tuple (content, type) with the raw content of the file and
    either 'pdf' or 'djvu', or (False, None) if it could not be downloaded.
    """
    for proxy in config.get("proxies"):
        if proxy.startswith('socks'):
            if proxy[5] == '4':
                proxy_type = socks.SOCKS4
            else:
                proxy_type = socks.SOCKS5
            proxy = proxy[proxy.find('://') + 3:]
            try:
                proxy, port = proxy.split(':')
                # PySocks expects the port as an int
                port = int(port)
            except ValueError:
                port = None
            socks.set_default_proxy(proxy_type, proxy, port)
            socket.socket = socks.socksocket
        elif proxy == '':
            # An empty proxy entry means "direct connection"
            socket.socket = default_socket
        else:
            try:
                proxy, port = proxy.split(':')
                port = int(port)
            except ValueError:
                port = None
            socks.set_default_proxy(socks.HTTP, proxy, port)
            socket.socket = socks.socksocket
        try:
            r = urlopen(url)
            size_header = _get_header(r, 'content-length')
            size = int(size_header.strip()) if size_header is not None else 0
            dl = b""
            dl_size = 0
            while True:
                buf = r.read(1024)
                if buf:
                    dl += buf
                    dl_size += len(buf)
                    if size != 0:
                        # Draw a 50-character progress bar
                        done = int(50 * dl_size / size)
                        sys.stdout.write("\r[%s%s]" %
                                         ('=' * done, ' ' * (50 - done)))
                        sys.stdout.write(" " +
                                         str(int(float(done) / 50 * 100)) +
                                         "%")
                        sys.stdout.flush()
                else:
                    break
            contenttype = False
            contenttype_req = _get_header(r, 'content-type')
            if contenttype_req is None:
                continue
            if 'pdf' in contenttype_req:
                contenttype = 'pdf'
            elif 'djvu' in contenttype_req:
                contenttype = 'djvu'
            if r.getcode() != 200 or contenttype is False:
                continue
            return dl, contenttype
        except ValueError:
            tools.warning("Invalid URL")
            return False, None
        except (URLError, socket.error):
            if proxy != "":
                proxy_txt = "using proxy " + proxy
            else:
                proxy_txt = "without using any proxy"
            tools.warning("Unable to get " + url + " " + proxy_txt + ". It " +
                          "may not be available at the moment.")
            continue
    return False, None


isbn_re = re.compile(r'isbn[\s]?:?[\s]?((?:[0-9]{3}[ -]?)?[0-9]{1,5}'
                     r'[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])',
                     re.IGNORECASE)
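
# The sketch below is illustrative and not part of the original module: it
# shows the (content, type) contract of download(). The default URL and the
# output filename are hypothetical.
def _example_download(url="https://example.com/some_paper.pdf"):
    """Fetch url and save it with an extension matching its detected type."""
    content, filetype = download(url)
    if content is not False:
        out = "downloaded." + filetype
        with open(out, "wb") as fh:
            fh.write(content)
        return out
    return None
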
def findISBN(src):
    """Search for a valid ISBN in src.

    Returns the ISBN, or False if not found or an error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    else:
        return False

    extractISBN = None
    while totext.poll() is None:
        extractfull = ' '.join([i.decode(stdout_encoding).strip()
                                for i in totext.stdout.readlines()])
        # 'Œ' is a common pdftotext artifact for a dash; replace it *before*
        # lowercasing, since lower() turns it into 'œ' and the replacement
        # would never match.
        extractISBN = isbn_re.search(extractfull.replace('Œ', '-').lower())
        if extractISBN:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    cleanISBN = False
    # The clean ISBN is the ISBN number without separators
    if extractISBN:
        cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
    return cleanISBN


def isbn2Bib(isbn):
    """Try to get a BibTeX entry from an ISBN number."""
    # The 'default' service merges results from worldcat.org and Google Books
    try:
        return isbnlib.registry.bibformatters['bibtex'](
            isbnlib.meta(isbn, 'default'))
    except (isbnlib.ISBNLibException, TypeError):
        return ''


doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
doi_pnas_re = re.compile(r'(?<=doi).?10\.1073/pnas\.\d+', re.IGNORECASE)
doi_jcb_re = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
clean_doi_re = re.compile(r'^/')
clean_doi_faseb_re = re.compile(r'^10\.1096')
clean_doi_jcb_re = re.compile(r'^10\.1083')
clean_doi_len_re = re.compile(r'\d\.\d')
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
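
# Illustrative sketch (not in the original module): chaining findISBN() and
# isbn2Bib() to go from a file on disk to a BibTeX entry. The default path is
# hypothetical.
def _example_isbn_lookup(path="book.pdf"):
    """Return a BibTeX entry for the ISBN found in path, or ''."""
    isbn = findISBN(path)
    if isbn:
        return isbn2Bib(isbn)
    return ''
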
def findArticleID(src, only=["DOI", "arXiv"]):
    """Search for a valid article ID (DOI or arXiv) in src.

    Returns a tuple (type, first matching ID), or (False, False) if not
    found or an error occurred.

    From:
    http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    and
    https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return (False, False)

    extractfull = ''
    extract_type = False
    extractID = None
    while totext.poll() is None:
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        # Try to extract a DOI
        if "DOI" in only:
            # Replace the 'Œ' pdftotext artifact before lowercasing, as
            # lower() would turn it into 'œ' and it would never match.
            extractlower = extractfull.replace('Œ', '-').lower()
            extractlower = extractlower.replace('digital object identifier',
                                                'doi')
            extractID = doi_re.search(extractlower)
            if not extractID:
                # PNAS fix
                extractID = doi_pnas_re.search(
                    extractlower.replace('pnas', '/pnas'))
                if not extractID:
                    # JCB fix
                    extractID = doi_jcb_re.search(extractlower)
            if extractID:
                extract_type = "DOI"
                totext.terminate()
        # Try to extract an arXiv id
        if "arXiv" in only:
            tmp_extractID = arXiv_re.search(extractfull)
            if tmp_extractID:
                if not extractID or extractID.start(0) > tmp_extractID.start(1):
                    # Only use the arXiv id if it occurs before the DOI
                    # in the pdf
                    extractID = tmp_extractID
                    extract_type = "arXiv"
                    totext.terminate()
        if extract_type is not False:
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return (False, False)

    if extractID is not None and extract_type == "DOI":
        # If a DOI was extracted, clean it up and return it
        cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
        if clean_doi_re.search(cleanDOI):
            cleanDOI = cleanDOI[1:]
        # FASEB J fix
        if clean_doi_faseb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:20]
        # Second JCB fix
        if clean_doi_jcb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:21]
        if len(cleanDOI) > 40:
            # An overly long match usually means the regexp ran into the
            # surrounding text: truncate at the first alphabetic character
            # that follows a digit after the "10.xxxx/" prefix.
            cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
            reps = {'.': 'A', '-': '0'}
            cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
            digitStart = 0
            for i in range(len(cleanDOItemp)):
                if cleanDOItemp[i].isdigit():
                    digitStart = 1
                if cleanDOItemp[i].isalpha() and digitStart:
                    break
            cleanDOI = cleanDOI[0:(8 + i)]
        return ("DOI", cleanDOI)
    elif extractID is not None and extract_type == "arXiv":
        # If an arXiv id was extracted, return it
        return ("arXiv", extractID.group(1))
    return (False, False)


def doi2Bib(doi):
    """Return a BibTeX string of metadata for a given DOI.

    From: https://gist.github.com/jrsmith3/5513926
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    req = Request(url, headers=headers)
    try:
        r = urlopen(req)
        if _get_header(r, 'content-type') == 'application/x-bibtex':
            return r.read().decode('utf-8')
        else:
            return ''
    except (URLError, socket.error):
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi ' + doi)
        return ''


def arXiv2Bib(arxiv):
    """Return a BibTeX string of metadata for a given arXiv id.

    arxiv is an arXiv id.
    """
    bibtex = arxiv_metadata.arxiv2bib([arxiv])
    for bib in bibtex:
        if isinstance(bib, arxiv_metadata.ReferenceErrorInfo):
            continue
        else:
            fetched_bibtex = bibtexparser.loads(bib.bibtex())
            fetched_bibtex = fetched_bibtex.entries_dict
            fetched_bibtex = fetched_bibtex[list(fetched_bibtex.keys())[0]]
            try:
                del fetched_bibtex['file']
            except KeyError:
                pass
            return tools.parsed2Bibtex(fetched_bibtex)
    return ''


HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')


def findHALId(src):
    """Search for a valid HAL id in src.

    Returns a tuple (HAL id, version), or False if not found or an error
    occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        return False

    extractID = None
    while totext.poll() is None:
        extractfull = ' '.join([i.decode(stdout_encoding).strip()
                                for i in totext.stdout.readlines()])
        extractID = HAL_re.search(extractfull)
        if extractID:
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False
    elif extractID is not None:
        return extractID.group(1), extractID.group(2)
    return False
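
# Illustrative sketch (not in the original module): dispatching on the result
# of findArticleID() to fetch a BibTeX entry with the matching resolver. The
# default path is hypothetical.
def _example_fetch_bibtex(path="paper.pdf"):
    """Return a BibTeX entry for the DOI or arXiv id found in path, or ''."""
    id_type, article_id = findArticleID(path)
    if id_type == "DOI":
        return doi2Bib(article_id)
    elif id_type == "arXiv":
        return arXiv2Bib(article_id)
    return ''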