// expresso-timer/vad.js
//
// 130 lines
// 3.7 KiB
// JavaScript

'use strict';
// From https://github.com/johni0702/voice-activity-detection
var analyserFrequency = require('analyser-frequency-average');
module.exports = function(audioContext, stream, opts) {
opts = opts || {};
var defaults = {
fftSize: 1024,
bufferLen: 1024,
smoothingTimeConstant: 0.2,
minCaptureFreq: 85, // in Hz
maxCaptureFreq: 255, // in Hz
noiseCaptureDuration: 1000, // in ms
minNoiseLevel: 0.3, // from 0 to 1
maxNoiseLevel: 0.7, // from 0 to 1
avgNoiseMultiplier: 1.2,
onVoiceStart: function() {
},
onVoiceStop: function() {
},
onUpdate: function(val) {
}
};
var options = {};
for (var key in defaults) {
options[key] = opts.hasOwnProperty(key) ? opts[key] : defaults[key];
}
var baseLevel = 0;
var voiceScale = 1;
var activityCounter = 0;
var activityCounterMin = 0;
var activityCounterMax = 60;
var activityCounterThresh = 5;
var envFreqRange = [];
var isNoiseCapturing = true;
var prevVadState = undefined;
var vadState = false;
var captureTimeout = null;
var source = audioContext.createMediaStreamSource(stream);
var analyser = audioContext.createAnalyser();
analyser.smoothingTimeConstant = options.smoothingTimeConstant;
analyser.fftSize = options.fftSize;
var scriptProcessorNode = audioContext.createScriptProcessor(options.bufferLen, 1, 1);
connect();
scriptProcessorNode.onaudioprocess = monitor;
if (isNoiseCapturing) {
//console.log('VAD: start noise capturing');
captureTimeout = setTimeout(init, options.noiseCaptureDuration);
}
function init() {
//console.log('VAD: stop noise capturing');
isNoiseCapturing = false;
envFreqRange = envFreqRange.filter(function(val) {
return val;
}).sort();
var averageEnvFreq = envFreqRange.length ? envFreqRange.reduce(function (p, c) { return Math.min(p, c) }, 1) : (options.minNoiseLevel || 0.1);
baseLevel = averageEnvFreq * options.avgNoiseMultiplier;
if (options.minNoiseLevel && baseLevel < options.minNoiseLevel) baseLevel = options.minNoiseLevel;
if (options.maxNoiseLevel && baseLevel > options.maxNoiseLevel) baseLevel = options.maxNoiseLevel;
voiceScale = 1 - baseLevel;
//console.log('VAD: base level:', baseLevel);
}
function connect() {
source.connect(analyser);
analyser.connect(scriptProcessorNode);
scriptProcessorNode.connect(audioContext.destination);
}
function disconnect() {
scriptProcessorNode.disconnect();
}
function destroy() {
captureTimeout && clearTimeout(captureTimeout);
disconnect();
analyser.disconnect();
source.disconnect();
source.mediaStream.getTracks().forEach(track => track.stop());
}
function monitor() {
var frequencies = new Uint8Array(analyser.frequencyBinCount);
analyser.getByteFrequencyData(frequencies);
var average = analyserFrequency(analyser, frequencies, options.minCaptureFreq, options.maxCaptureFreq);
if (isNoiseCapturing) {
envFreqRange.push(average);
return;
}
if (average >= baseLevel && activityCounter < activityCounterMax) {
activityCounter++;
} else if (average < baseLevel && activityCounter > activityCounterMin) {
activityCounter--;
}
vadState = activityCounter > activityCounterThresh;
if (prevVadState !== vadState) {
vadState ? onVoiceStart() : onVoiceStop();
prevVadState = vadState;
}
options.onUpdate(Math.max(0, average - baseLevel) / voiceScale);
}
function onVoiceStart() {
options.onVoiceStart();
}
function onVoiceStop() {
options.onVoiceStop();
}
return {connect: connect, disconnect: disconnect, destroy: destroy};
};