// voice_activity_detector.cc

/*

 *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.

 *  Use of this source code is governed by a BSD-style license

 *  that can be found in the LICENSE file in the root of the source

 *  tree. An additional intellectual property rights grant can be found

 *  in the file PATENTS.  All contributing project authors may

 *  be found in the AUTHORS file in the root of the source tree.

*/

#include "modules/audio_processing/vad/voice_activity_detector.h"

#include <algorithm>

#include "rtc_base/checks.h"

namespace webrtc {

namespace {

// Channel count handed to the resampler when the input rate must be converted.
constexpr size_t kNumChannels = 1;

// Value `last_voice_probability_` holds before any chunk has produced frames.
constexpr double kDefaultVoiceValue = 1.0;

// Starting probability written for every frame of a non-silent chunk, before
// the standalone and pitch-based VADs are asked to refine it.
constexpr double kNeutralProbability = 0.5;

// Probability written for every frame when the extracted features are flagged
// as silence.
constexpr double kLowProbability = 0.01;

}  // namespace

// Constructs the detector. `last_voice_probability_` starts at
// `kDefaultVoiceValue` (1.0), and `standalone_vad_` is initialized with
// whatever `StandaloneVad::Create()` returns.
VoiceActivityDetector::VoiceActivityDetector()
    : last_voice_probability_(kDefaultVoiceValue),
      standalone_vad_(StandaloneVad::Create()) {}

// Compiler-generated destruction, explicitly defaulted here in the .cc file
// rather than at the declaration.
VoiceActivityDetector::~VoiceActivityDetector() = default;

// Processes one 10 ms chunk of audio and refreshes the per-chunk results.
//
// `audio` points to `length` samples at `sample_rate_hz`; `length` must equal
// `sample_rate_hz / 100` (debug-checked). Input at any rate other than
// `kSampleRateHz` is first resampled into `resampled_`.
//
// Because ISAC has a different chunk length, this method updates
// `chunkwise_voice_probabilities_` and `chunkwise_rms_` when there is new
// data. Otherwise it clears them (both are resized to `features_.num_frames`,
// which is then zero). `last_voice_probability_` is only overwritten when at
// least one frame was produced; otherwise it keeps its previous value.
void VoiceActivityDetector::ProcessChunk(const int16_t* audio,
                                         size_t length,
                                         int sample_rate_hz) {
  RTC_DCHECK_EQ(length, sample_rate_hz / 100);
  // TODO(bugs.webrtc.org/7494): Remove resampling and force 16 kHz audio.
  // Resample to the required rate.
  const int16_t* resampled_ptr = audio;
  if (sample_rate_hz != kSampleRateHz) {
    RTC_CHECK_EQ(
        resampler_.ResetIfNeeded(sample_rate_hz, kSampleRateHz, kNumChannels),
        0);
    // NOTE(review): `length` is passed as both the input size and the last
    // argument; presumably Push() writes the resampled sample count back into
    // it, which the check below relies on - confirm against Resampler::Push.
    // Any status Push() returns is not inspected here.
    resampler_.Push(audio, length, resampled_, kLength10Ms, length);
    resampled_ptr = resampled_;
  }
  // Applies to both paths: whether or not resampling happened, the data handed
  // on must be exactly `kLength10Ms` samples long.
  RTC_DCHECK_EQ(length, kLength10Ms);

  // Each chunk needs to be passed into `standalone_vad_`, because internally it
  // buffers the audio and processes it all at once when GetActivity() is
  // called.
  RTC_CHECK_EQ(standalone_vad_->AddAudio(resampled_ptr, length), 0);

  audio_processing_.ExtractFeatures(resampled_ptr, length, &features_);

  // Size both outputs to the number of frames extracted for this chunk; a
  // count of zero leaves them empty.
  chunkwise_voice_probabilities_.resize(features_.num_frames);
  chunkwise_rms_.resize(features_.num_frames);
  std::copy(features_.rms, features_.rms + chunkwise_rms_.size(),
            chunkwise_rms_.begin());

  if (features_.num_frames > 0) {
    if (features_.silence) {
      // The other features are invalid, so set the voice probabilities to an
      // arbitrary low value.
      std::fill(chunkwise_voice_probabilities_.begin(),
                chunkwise_voice_probabilities_.end(), kLowProbability);
    } else {
      // Seed every frame with a neutral value, then pass the same buffer to
      // the standalone VAD and afterwards to the pitch-based VAD.
      std::fill(chunkwise_voice_probabilities_.begin(),
                chunkwise_voice_probabilities_.end(), kNeutralProbability);
      RTC_CHECK_GE(
          standalone_vad_->GetActivity(&chunkwise_voice_probabilities_[0],
                                       chunkwise_voice_probabilities_.size()),
          0);
      RTC_CHECK_GE(pitch_based_vad_.VoicingProbability(
                       features_, &chunkwise_voice_probabilities_[0]),
                   0);
    }
    // Remember the most recent frame's probability for this chunk.
    last_voice_probability_ = chunkwise_voice_probabilities_.back();
  }
}

}  // namespace webrtc