Voice Activity Detection
About Voice Activity Detection
Voice activity detection (VAD) is a technology used to identify and distinguish between speech and non-speech segments in an audio signal. It plays a crucial role in various applications such as speech recognition, speaker identification, and audio coding. VAD algorithms analyze the characteristics of the audio signal, such as energy, spectral content, and pitch, to determine whether it contains speech or silence. By accurately detecting voice activity, VAD systems can efficiently process and analyze only the relevant speech segments, reducing computational complexity and improving overall system performance.
Switchboard Editor example
This example plays a recording of a conversation. The Switchboard VoiceActivityDetectorNode detects the speech segments and displays the detection state accordingly.
Code Example
- JSON
- Swift
- Kotlin
- C++
- JavaScript
{
  "nodes": [
    { "id": "vadNode", "type": "VoiceActivityDetectorNode" },
    { "id": "splitterNode", "type": "BusSplitterNode" }
  ],
  "connections": [
    { "sourceNode": "inputNode", "destinationNode": "splitterNode" },
    { "sourceNode": "splitterNode", "destinationNode": "vadNode" },
    { "sourceNode": "splitterNode", "destinationNode": "outputNode" }
  ]
}
import SwitchboardSDK
/// Demonstrates wiring an `SBVoiceActivityDetectorNode` into a Switchboard audio graph.
/// The microphone input is routed through a bus splitter so one branch feeds the
/// VAD while the other passes through to the output unchanged.
class VoiceActivityDetectorExample {
    let audioGraph = SBAudioGraph()
    let vadNode = SBVoiceActivityDetectorNode()
    let splitterNode = SBBusSplitterNode()
    let audioEngine = SBAudioEngine()

    init() {
        audioEngine.microphoneEnabled = true
        audioGraph.addNode(vadNode)
        audioGraph.addNode(splitterNode)
        audioGraph.connect(audioGraph.inputNode, to: splitterNode)
        audioGraph.connect(splitterNode, to: vadNode)
        audioGraph.connect(splitterNode, to: audioGraph.outputNode)
    }

    /// Returns the current detection state reported by the VAD node.
    // BUG FIX: the original declared no return type yet returned a value, which
    // does not compile in Swift. `Int` matches the Kotlin example's
    // `getVADStatus(): Int` — confirm against the SDK's actual `status` type.
    func getVADStatus() -> Int {
        return vadNode.status
    }

    /// Starts audio processing on the configured graph.
    func startEngine() {
        audioEngine.start(audioGraph)
    }

    /// Stops audio processing.
    func stopEngine() {
        audioEngine.stop()
    }
}
import android.content.Context
import com.synervoz.switchboard.sdk.AudioEngine
import com.synervoz.switchboard.sdk.audiograph.AudioGraph
import com.synervoz.switchboard.sdk.audiographnodes.BusSplitterNode
import com.synervoz.switchboard.sdk.audiographnodes.VoiceActivityDetectorNode
/**
 * Demonstrates wiring a [VoiceActivityDetectorNode] into a Switchboard audio graph.
 *
 * The graph input is routed through a [BusSplitterNode] so that one branch feeds
 * the VAD while the other passes through to the output unchanged. The engine is
 * started immediately on construction.
 */
class VoiceActivityDetectorExample(context: Context) {
    val audioEngine = AudioEngine(context)
    val audioGraph = AudioGraph()
    val vadNode = VoiceActivityDetectorNode()
    val splitterNode = BusSplitterNode()

    init {
        audioGraph.addNode(vadNode)
        audioGraph.addNode(splitterNode)
        audioGraph.connect(audioGraph.inputNode, splitterNode)
        audioGraph.connect(splitterNode, vadNode)
        audioGraph.connect(splitterNode, audioGraph.outputNode)
        audioEngine.start(audioGraph)
    }

    /** Returns the current detection state reported by the VAD node. */
    fun getVADStatus(): Int {
        return vadNode.status
    }

    /** Stops the engine and releases all native resources held by the graph and nodes. */
    fun close() {
        audioEngine.close()
        audioGraph.close()
        // BUG FIX: was `vadnode.close()` — unresolved reference (Kotlin identifiers
        // are case-sensitive), so the example did not compile.
        vadNode.close()
        splitterNode.close()
    }
}
#include "AudioGraph.hpp"
#include "BusSplitterNode.hpp"
#include "VoiceActivityDetectorNode.hpp"
using namespace switchboard;
// Demonstrates wiring a VoiceActivityDetectorNode into a Switchboard audio graph.
// The graph input is routed through a BusSplitterNode so that one branch feeds
// the VAD while the other passes through to the output unchanged.
class VoiceActivityDetectorExample {
public:
    VoiceActivityDetectorExample() {
        // Adding nodes to audio graph
        audioGraph.addNode(vadNode);
        audioGraph.addNode(splitterNode);
        // Connecting audio nodes
        audioGraph.connect(audioGraph.getInputNode(), splitterNode);
        audioGraph.connect(splitterNode, vadNode);
        audioGraph.connect(splitterNode, audioGraph.getOutputNode());
        // Starting the graph
        audioGraph.start();
    }

    // Example method called by the audio processing pipeline on each buffer.
    // Returns the result of the graph's process call.
    // NOTE(review): inBuffer and outBuffer wrap the same `buffers` storage,
    // i.e. the graph processes in place — confirm that is intended for this SDK.
    bool process(float** buffers, const uint numberOfChannels, const uint numberOfFrames, const uint sampleRate) {
        AudioBuffer<float> inBuffer = AudioBuffer<float>(numberOfChannels, numberOfFrames, false, sampleRate, buffers);
        AudioBuffer<float> outBuffer = AudioBuffer<float>(numberOfChannels, numberOfFrames, false, sampleRate, buffers);
        // BUG FIX: audioGraph is a value member (AudioGraph, not AudioGraph*),
        // so the original `audioGraph->process(...)` did not compile.
        const bool result = audioGraph.process(&inBuffer, &outBuffer);
        return result;
    }

private:
    AudioGraph audioGraph;
    VoiceActivityDetectorNode vadNode;
    BusSplitterNode splitterNode;
};
// Change the import to reflect the location of library
// relative to this publicly accessible AudioWorklet
import SwitchboardSDK from '../../libs/switchboard-sdk/SwitchboardSDK.js'
/**
 * AudioWorkletProcessor that runs a Switchboard audio graph containing a
 * VoiceActivityDetectorNode. The SDK is configured asynchronously after the
 * main thread posts the wasm bytes; until then the processor passes audio
 * through without touching the (not-yet-built) graph.
 */
class VoiceActivityDetectorProcessor extends AudioWorkletProcessor {
  constructor(options) {
    super()
    this.sampleRate = options.processorOptions.sampleRate
    this.port.onmessage = (event) => this.onMessage(event.data)
  }

  /** Posts a message (with optional transferables) back to the main thread. */
  sendMessage(message, transfer = []) {
    this.port.postMessage(message, transfer)
  }

  /** Dispatches commands received from the main thread. */
  onMessage(message) {
    if (message.command === 'requestUiDefinitions') {
      // NOTE(review): `uiDefinitions` is not defined in this file — presumably
      // declared or imported alongside this worklet; verify before shipping.
      this.sendMessage({ uiDefinitions: uiDefinitions })
    } else if (message.command === 'requestDynamicValueUpdate') {
      this.updateDynamicValue(message.id)
    } else if (message.wasmArrayBuffer) {
      const switchboardSdkConfigObject = {
        extensions: [],
        wasmBytes: message.wasmArrayBuffer,
      }
      this.configure(switchboardSdkConfigObject)
    }
  }

  /** Sends the current VAD status string for the given dynamic-value id. */
  updateDynamicValue(id) {
    // BUG FIX: guard — this can be invoked before constructAudioGraph() has
    // run, when this.vadNode is still undefined.
    if (!this.vadNode) return
    let value = this.getVADStatusString(this.vadNode.getStatus())
    this.sendMessage({ dynamicValueID: id, value: value })
  }

  /** Maps the numeric VAD status to a human-readable label. */
  getVADStatusString(value) {
    switch (value) {
      case 0:
        return 'Voice Detected'
      case 1:
        return 'Hangover'
      default:
        return 'Idle'
    }
  }

  /** Initializes the Switchboard SDK, then builds the audio graph. */
  configure(sdkConfig) {
    this.switchboard = new SwitchboardSDK()
    this.switchboard.configure(sdkConfig).then((response) => {
      this.constructAudioGraph()
    })
  }

  /** Builds stereo-in graph: input -> mono downmix -> VAD (no audio output). */
  constructAudioGraph() {
    const inputChannelLayout = [2]
    const outputChannelLayout = []
    const maxNumFrames = 128
    let audioGraph = this.switchboard.createAudioGraph(
      inputChannelLayout,
      outputChannelLayout,
      maxNumFrames,
      this.sampleRate
    )
    let multiChannelToMonoNode =
      new this.switchboard.classes.MultiChannelToMonoNode()
    let vadNode = new this.switchboard.classes.VoiceActivityDetectorNode()
    let audioGraphInputNode = audioGraph.getInputNode()
    audioGraph.addNode(multiChannelToMonoNode)
    audioGraph.addNode(vadNode)
    audioGraph.connect(audioGraphInputNode, multiChannelToMonoNode)
    audioGraph.connect(multiChannelToMonoNode, vadNode)
    audioGraph.start()
    this.vadNode = vadNode
    this.multiChannelToMonoNode = multiChannelToMonoNode
    this.audioGraph = audioGraph
  }

  /** Releases the graph and node resources held on the wasm side. */
  destruct() {
    this.audioGraph.destruct()
    this.vadNode.destruct()
    this.multiChannelToMonoNode.destruct()
  }

  process(inputs, outputs, parameters) {
    // BUG FIX: the audio thread starts calling process() as soon as the
    // AudioWorkletNode is created — before the async configure() resolves —
    // so this.audioGraph may still be undefined. Returning true keeps the
    // processor alive until the graph exists.
    if (!this.audioGraph) return true
    return this.audioGraph.processGraph(inputs, outputs)
  }
}
// Expose the processor to the AudioWorklet under its registered name.
registerProcessor('VoiceActivityDetectorProcessor', VoiceActivityDetectorProcessor)