Voice Activity Detection
About Voice Activity Detection
Voice activity detection (VAD) is a technology used to identify and distinguish between speech and non-speech segments in an audio signal. It plays a crucial role in various applications such as speech recognition, speaker identification, and audio coding. VAD algorithms analyze the characteristics of the audio signal, such as energy, spectral content, and pitch, to determine whether it contains speech or silence. By accurately detecting voice activity, VAD systems can efficiently process and analyze only the relevant speech segments, reducing computational complexity and improving overall system performance.
Switchboard Editor example
This example plays a recording of a conversation. The Switchboard VoiceActivityDetectorNode detects the speech segments and displays the detection state accordingly.
Code Example
- JSON
- Swift
- Kotlin
- C++
- JavaScript
{
  "nodes": [
    { "id": "vadNode", "type": "VoiceActivityDetectorNode" },
    { "id": "splitterNode", "type": "BusSplitterNode" }
  ],
  "connections": [
    { "sourceNode": "inputNode", "destinationNode": "splitterNode" },
    { "sourceNode": "splitterNode", "destinationNode": "vadNode" },
    { "sourceNode": "splitterNode", "destinationNode": "outputNode" }
  ]
}
import SwitchboardSDK
/// Demonstrates wiring an `SBVoiceActivityDetectorNode` into a Switchboard audio graph.
/// The microphone input is routed through a bus splitter so one branch feeds the
/// VAD while the other passes through to the output unchanged.
class VoiceActivityDetectorExample {
    let audioGraph = SBAudioGraph()
    let vadNode = SBVoiceActivityDetectorNode()
    let splitterNode = SBBusSplitterNode()
    let audioEngine = SBAudioEngine()

    init() {
        audioEngine.microphoneEnabled = true
        audioGraph.addNode(vadNode)
        audioGraph.addNode(splitterNode)
        audioGraph.connect(audioGraph.inputNode, to: splitterNode)
        audioGraph.connect(splitterNode, to: vadNode)
        audioGraph.connect(splitterNode, to: audioGraph.outputNode)
    }

    /// Returns the current detection state reported by the VAD node.
    // BUG FIX: the original declared no return type yet returned a value, which
    // does not compile in Swift. `Int` matches the Kotlin example's
    // `getVADStatus(): Int` — confirm against the SDK's actual `status` type.
    func getVADStatus() -> Int {
        return vadNode.status
    }

    /// Starts audio processing on the configured graph.
    func startEngine() {
        audioEngine.start(audioGraph)
    }

    /// Stops audio processing.
    func stopEngine() {
        audioEngine.stop()
    }
}
import android.content.Context
import com.synervoz.switchboard.sdk.AudioEngine
import com.synervoz.switchboard.sdk.audiograph.AudioGraph
import com.synervoz.switchboard.sdk.audiographnodes.BusSplitterNode
import com.synervoz.switchboard.sdk.audiographnodes.VoiceActivityDetectorNode
/**
 * Demonstrates wiring a [VoiceActivityDetectorNode] into a Switchboard audio graph.
 *
 * The graph input is routed through a [BusSplitterNode] so that one branch feeds
 * the VAD while the other passes through to the output unchanged. The engine is
 * started immediately on construction.
 */
class VoiceActivityDetectorExample(context: Context) {
    val audioEngine = AudioEngine(context)
    val audioGraph = AudioGraph()
    val vadNode = VoiceActivityDetectorNode()
    val splitterNode = BusSplitterNode()

    init {
        audioGraph.addNode(vadNode)
        audioGraph.addNode(splitterNode)
        audioGraph.connect(audioGraph.inputNode, splitterNode)
        audioGraph.connect(splitterNode, vadNode)
        audioGraph.connect(splitterNode, audioGraph.outputNode)
        audioEngine.start(audioGraph)
    }

    /** Returns the current detection state reported by the VAD node. */
    fun getVADStatus(): Int {
        return vadNode.status
    }

    /** Stops the engine and releases all native resources held by the graph and nodes. */
    fun close() {
        audioEngine.close()
        audioGraph.close()
        // BUG FIX: was `vadnode.close()` — unresolved reference (Kotlin identifiers
        // are case-sensitive), so the example did not compile.
        vadNode.close()
        splitterNode.close()
    }
}
#include "AudioGraph.hpp"
#include "BusSplitterNode.hpp"
#include "VoiceActivityDetectorNode.hpp"
using namespace switchboard;
// Demonstrates wiring a VoiceActivityDetectorNode into a Switchboard audio graph.
// The graph input is routed through a BusSplitterNode so that one branch feeds
// the VAD while the other passes through to the output unchanged.
class VoiceActivityDetectorExample {
public:
    VoiceActivityDetectorExample() {
        // Adding nodes to audio graph
        audioGraph.addNode(vadNode);
        audioGraph.addNode(splitterNode);
        // Connecting audio nodes
        audioGraph.connect(audioGraph.getInputNode(), splitterNode);
        audioGraph.connect(splitterNode, vadNode);
        audioGraph.connect(splitterNode, audioGraph.getOutputNode());
        // Starting the graph
        audioGraph.start();
    }

    // Example method called by the audio processing pipeline on each buffer.
    // Returns the result of the graph's process call.
    // NOTE(review): inBuffer and outBuffer wrap the same `buffers` storage,
    // i.e. the graph processes in place — confirm that is intended for this SDK.
    bool process(float** buffers, const uint numberOfChannels, const uint numberOfFrames, const uint sampleRate) {
        AudioBuffer<float> inBuffer = AudioBuffer<float>(numberOfChannels, numberOfFrames, false, sampleRate, buffers);
        AudioBuffer<float> outBuffer = AudioBuffer<float>(numberOfChannels, numberOfFrames, false, sampleRate, buffers);
        // BUG FIX: audioGraph is a value member (AudioGraph, not AudioGraph*),
        // so the original `audioGraph->process(...)` did not compile.
        const bool result = audioGraph.process(&inBuffer, &outBuffer);
        return result;
    }

private:
    AudioGraph audioGraph;
    VoiceActivityDetectorNode vadNode;
    BusSplitterNode splitterNode;
};
// Change the import to reflect the location of library
// relative to this publicly accessible AudioWorklet
import SwitchboardSDK from '../../libs/switchboard-sdk/SwitchboardSDK.js'
/**
 * AudioWorkletProcessor that runs a Switchboard audio graph containing a
 * VoiceActivityDetectorNode. The SDK is configured asynchronously after the
 * main thread posts the wasm bytes; until then the processor passes audio
 * through without touching the (not-yet-built) graph.
 */
class VoiceActivityDetectorProcessor extends AudioWorkletProcessor {
  constructor(options) {
    super()
    this.sampleRate = options.processorOptions.sampleRate
    this.port.onmessage = (event) => this.onMessage(event.data)
  }

  /** Posts a message (with optional transferables) back to the main thread. */
  sendMessage(message, transfer = []) {
    this.port.postMessage(message, transfer)
  }

  /** Dispatches commands received from the main thread. */
  onMessage(message) {
    if (message.command === 'requestUiDefinitions') {
      // NOTE(review): `uiDefinitions` is not defined in this file — presumably
      // declared or imported alongside this worklet; verify before shipping.
      this.sendMessage({ uiDefinitions: uiDefinitions })
    } else if (message.command === 'requestDynamicValueUpdate') {
      this.updateDynamicValue(message.id)
    } else if (message.wasmArrayBuffer) {
      const switchboardSdkConfigObject = {
        extensions: [],
        wasmBytes: message.wasmArrayBuffer,
      }
      this.configure(switchboardSdkConfigObject)
    }
  }

  /** Sends the current VAD status string for the given dynamic-value id. */
  updateDynamicValue(id) {
    // BUG FIX: guard — this can be invoked before constructAudioGraph() has
    // run, when this.vadNode is still undefined.
    if (!this.vadNode) return
    let value = this.getVADStatusString(this.vadNode.getStatus())
    this.sendMessage({ dynamicValueID: id, value: value })
  }

  /** Maps the numeric VAD status to a human-readable label. */
  getVADStatusString(value) {
    switch (value) {
      case 0:
        return 'Voice Detected'
      case 1:
        return 'Hangover'
      default:
        return 'Idle'
    }
  }

  /** Initializes the Switchboard SDK, then builds the audio graph. */
  configure(sdkConfig) {
    this.switchboard = new SwitchboardSDK()
    this.switchboard.configure(sdkConfig).then((response) => {
      this.constructAudioGraph()
    })
  }

  /** Builds stereo-in graph: input -> mono downmix -> VAD (no audio output). */
  constructAudioGraph() {
    const inputChannelLayout = [2]
    const outputChannelLayout = []
    const maxNumFrames = 128
    let audioGraph = this.switchboard.createAudioGraph(
      inputChannelLayout,
      outputChannelLayout,
      maxNumFrames,
      this.sampleRate
    )
    let multiChannelToMonoNode =
      new this.switchboard.classes.MultiChannelToMonoNode()
    let vadNode = new this.switchboard.classes.VoiceActivityDetectorNode()
    let audioGraphInputNode = audioGraph.getInputNode()
    audioGraph.addNode(multiChannelToMonoNode)
    audioGraph.addNode(vadNode)
    audioGraph.connect(audioGraphInputNode, multiChannelToMonoNode)
    audioGraph.connect(multiChannelToMonoNode, vadNode)
    audioGraph.start()
    this.vadNode = vadNode
    this.multiChannelToMonoNode = multiChannelToMonoNode
    this.audioGraph = audioGraph
  }

  /** Releases the graph and node resources held on the wasm side. */
  destruct() {
    this.audioGraph.destruct()
    this.vadNode.destruct()
    this.multiChannelToMonoNode.destruct()
  }

  process(inputs, outputs, parameters) {
    // BUG FIX: the audio thread starts calling process() as soon as the
    // AudioWorkletNode is created — before the async configure() resolves —
    // so this.audioGraph may still be undefined. Returning true keeps the
    // processor alive until the graph exists.
    if (!this.audioGraph) return true
    return this.audioGraph.processGraph(inputs, outputs)
  }
}
// Expose the processor to the AudioWorklet under its registered name.
registerProcessor('VoiceActivityDetectorProcessor', VoiceActivityDetectorProcessor)