voice_processing_audio_unit.mm

/*

 *  Copyright 2016 The WebRTC Project Authors. All rights reserved.

 *  Use of this source code is governed by a BSD-style license

 *  that can be found in the LICENSE file in the root of the source

 *  tree. An additional intellectual property rights grant can be found

 *  in the file PATENTS.  All contributing project authors may

 *  be found in the AUTHORS file in the root of the source tree.

*/

#import "voice_processing_audio_unit.h"

#include "rtc_base/checks.h"

#include "system_wrappers/include/metrics.h"

#import "base/RTCLogging.h"

#import "sdk/objc/components/audio/RTCAudioSessionConfiguration.h"

#if !defined(NDEBUG)

// Logs every field of `description` through RTCLog. This helper is only
// compiled when NDEBUG is not defined (see the surrounding preprocessor
// guard).
static void LogStreamDescription(AudioStreamBasicDescription description) {
  // mFormatID packs a four-character code into a UInt32 with the first
  // character in the most significant byte. Unpacking with shifts yields the
  // same string as a host-to-big-endian swap followed by a byte copy, without
  // depending on host byte order or on the legacy bcopy() routine.
  const UInt32 format_id = description.mFormatID;
  const char format_id_string[5] = {
      static_cast<char>((format_id >> 24) & 0xFF),
      static_cast<char>((format_id >> 16) & 0xFF),
      static_cast<char>((format_id >> 8) & 0xFF),
      static_cast<char>(format_id & 0xFF),
      '\0',
  };

  // The UInt32 fields are cast to unsigned int so they match the %u / %X
  // conversion specifiers exactly.
  RTCLog(@"AudioStreamBasicDescription: {\n"
          "  mSampleRate: %.2f\n"
          "  formatIDString: %s\n"
          "  mFormatFlags: 0x%X\n"
          "  mBytesPerPacket: %u\n"
          "  mFramesPerPacket: %u\n"
          "  mBytesPerFrame: %u\n"
          "  mChannelsPerFrame: %u\n"
          "  mBitsPerChannel: %u\n"
          "  mReserved: %u\n}",
         description.mSampleRate,
         format_id_string,
         static_cast<unsigned int>(description.mFormatFlags),
         static_cast<unsigned int>(description.mBytesPerPacket),
         static_cast<unsigned int>(description.mFramesPerPacket),
         static_cast<unsigned int>(description.mBytesPerFrame),
         static_cast<unsigned int>(description.mChannelsPerFrame),
         static_cast<unsigned int>(description.mBitsPerChannel),
         static_cast<unsigned int>(description.mReserved));
}

#endif

namespace webrtc {

namespace ios_adm {

// AudioUnitInitialize() can fail when it is called back-to-back on different
// ADM instances. As a fall-back, initialization is retried with a small delay
// between attempts; this constant caps the number of attempts.
static constexpr int kMaxNumberOfAudioUnitInitializeAttempts = 5;

// Bus 1 of a VP I/O unit connects to the input hardware (microphone).
static constexpr AudioUnitElement kInputBus = 1;

// Bus 0 of a VP I/O unit connects to the output hardware (speaker).
static constexpr AudioUnitElement kOutputBus = 0;

// Queries the automatic gain control (AGC) state of the processed microphone
// signal. AGC should be on by default for Voice Processing audio units.
//
// `audio_unit` is the unit to query and must be non-null. `enabled` receives
// the property value and must be non-null. Returns the OSStatus reported by
// AudioUnitGetProperty().
static OSStatus GetAGCState(AudioUnit audio_unit, UInt32* enabled) {
  RTC_DCHECK(audio_unit);
  RTC_DCHECK(enabled);
  UInt32 size = sizeof(*enabled);
  const OSStatus result =
      AudioUnitGetProperty(audio_unit,
                           kAUVoiceIOProperty_VoiceProcessingEnableAGC,
                           kAudioUnitScope_Global,
                           kInputBus,
                           enabled,
                           &size);
  // Only report the state when the query succeeded. On failure `*enabled`
  // still holds whatever the caller put there, and logging it would look like
  // a real reading.
  if (result == noErr) {
    RTCLog(@"VPIO unit AGC: %u", static_cast<unsigned int>(*enabled));
  }
  return result;
}

// Stores the configuration flags and the observer. No audio unit is created
// here: vpio_unit_ starts out null and state_ starts as kInitRequired, so
// Init() has to be called before the object can be used.
//
// `bypass_voice_processing` and `detect_mute_speech` are consulted later by
// Initialize() and SetMicrophoneMute(). `observer` receives the audio
// callbacks and must be non-null.
VoiceProcessingAudioUnit::VoiceProcessingAudioUnit(bool bypass_voice_processing,
                                                   bool detect_mute_speech,
                                                   VoiceProcessingAudioUnitObserver* observer)
    : bypass_voice_processing_(bypass_voice_processing),
      detect_mute_speech_(detect_mute_speech),
      observer_(observer),
      vpio_unit_(nullptr),
      state_(kInitRequired) {
  RTC_DCHECK(observer);
}

// Releases the underlying audio unit, if one was created. DisposeAudioUnit()
// stops and uninitializes the unit first when the current state requires it.
VoiceProcessingAudioUnit::~VoiceProcessingAudioUnit() {
  DisposeAudioUnit();
}

// Out-of-line definition of the class constant. GetFormat() uses it as the
// bytes-per-packet and bytes-per-frame value and, multiplied by 8, as the
// bits-per-channel value of the stream format.
const UInt32 VoiceProcessingAudioUnit::kBytesPerSample = 2;

// Creates the Voice Processing I/O audio unit and configures it: enables input
// and output, installs the render and input callbacks, and turns off the
// unit's own buffer allocation on the input bus.
//
// Must be called while state_ is kInitRequired. Returns true and moves to
// kUninitialized on success. On failure returns false, logs the error, and
// disposes any unit that was already created (vpio_unit_ is left null).
bool VoiceProcessingAudioUnit::Init() {
  RTC_DCHECK_EQ(state_, kInitRequired);

  // Create an audio component description to identify the Voice Processing
  // I/O audio unit.
  AudioComponentDescription vpio_unit_description;
  vpio_unit_description.componentType = kAudioUnitType_Output;
  vpio_unit_description.componentSubType = kAudioUnitSubType_VoiceProcessingIO;
  vpio_unit_description.componentManufacturer = kAudioUnitManufacturer_Apple;
  vpio_unit_description.componentFlags = 0;
  vpio_unit_description.componentFlagsMask = 0;

  // Obtain an audio unit instance given the description.
  AudioComponent found_vpio_unit_ref =
      AudioComponentFindNext(nullptr, &vpio_unit_description);
  if (found_vpio_unit_ref == nullptr) {
    // No component matched the description; do not hand a null component to
    // AudioComponentInstanceNew().
    RTCLogError(@"AudioComponentFindNext found no Voice Processing I/O unit.");
    return false;
  }

  // Create a Voice Processing IO audio unit.
  OSStatus result = AudioComponentInstanceNew(found_vpio_unit_ref, &vpio_unit_);
  if (result != noErr) {
    vpio_unit_ = nullptr;
    RTCLogError(@"AudioComponentInstanceNew failed. Error=%ld.", (long)result);
    return false;
  }

  // Enable input on the input scope of the input element.
  UInt32 enable_input = 1;
  result = AudioUnitSetProperty(vpio_unit_, kAudioOutputUnitProperty_EnableIO,
                                kAudioUnitScope_Input, kInputBus, &enable_input,
                                sizeof(enable_input));
  if (result != noErr) {
    DisposeAudioUnit();
    RTCLogError(@"Failed to enable input on input scope of input element. "
                 "Error=%ld.",
                (long)result);
    return false;
  }

  // Enable output on the output scope of the output element.
  UInt32 enable_output = 1;
  result = AudioUnitSetProperty(vpio_unit_, kAudioOutputUnitProperty_EnableIO,
                                kAudioUnitScope_Output, kOutputBus,
                                &enable_output, sizeof(enable_output));
  if (result != noErr) {
    DisposeAudioUnit();
    RTCLogError(@"Failed to enable output on output scope of output element. "
                 "Error=%ld.",
                (long)result);
    return false;
  }

  // Specify the callback function that provides audio samples to the audio
  // unit. `this` is passed as the ref-con so the static trampoline can reach
  // the instance.
  AURenderCallbackStruct render_callback;
  render_callback.inputProc = OnGetPlayoutData;
  render_callback.inputProcRefCon = this;
  result = AudioUnitSetProperty(
      vpio_unit_, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Input,
      kOutputBus, &render_callback, sizeof(render_callback));
  if (result != noErr) {
    DisposeAudioUnit();
    RTCLogError(@"Failed to specify the render callback on the output bus. "
                 "Error=%ld.",
                (long)result);
    return false;
  }

  // Disable AU buffer allocation for the recorder, we allocate our own.
  // TODO(henrika): not sure that it actually saves resource to make this call.
  UInt32 flag = 0;
  result = AudioUnitSetProperty(
      vpio_unit_, kAudioUnitProperty_ShouldAllocateBuffer,
      kAudioUnitScope_Output, kInputBus, &flag, sizeof(flag));
  if (result != noErr) {
    DisposeAudioUnit();
    RTCLogError(@"Failed to disable buffer allocation on the input bus. "
                 "Error=%ld.",
                (long)result);
    return false;
  }

  // Specify the callback to be called by the I/O thread to us when input audio
  // is available. The recorded samples can then be obtained by calling the
  // AudioUnitRender() method.
  AURenderCallbackStruct input_callback;
  input_callback.inputProc = OnDeliverRecordedData;
  input_callback.inputProcRefCon = this;
  result = AudioUnitSetProperty(vpio_unit_,
                                kAudioOutputUnitProperty_SetInputCallback,
                                kAudioUnitScope_Global, kInputBus,
                                &input_callback, sizeof(input_callback));
  if (result != noErr) {
    DisposeAudioUnit();
    RTCLogError(@"Failed to specify the input callback on the input bus. "
                 "Error=%ld.",
                (long)result);
    return false;
  }

  state_ = kUninitialized;
  return true;
}

// Returns the current lifecycle state of the audio unit wrapper.
VoiceProcessingAudioUnit::State VoiceProcessingAudioUnit::GetState() const {
  return state_;
}

// Applies the stream format for `sample_rate` to both buses, initializes the
// Voice Processing I/O unit (retrying on failure), and then configures the
// optional muted-speech listener, voice-processing bypass and built-in AGC.
//
// Returns false if a stream format cannot be set or if AudioUnitInitialize()
// keeps failing after kMaxNumberOfAudioUnitInitializeAttempts attempts.
// Failures in the later steps (listener, bypass, AGC) are logged but are not
// fatal. On success state_ becomes kInitialized and true is returned.
//
// Note that each retry blocks the calling thread for 100 ms.
bool VoiceProcessingAudioUnit::Initialize(Float64 sample_rate) {
  RTC_DCHECK_GE(state_, kUninitialized);
  RTCLog(@"Initializing audio unit with sample rate: %f", sample_rate);

  OSStatus result = noErr;
  AudioStreamBasicDescription format = GetFormat(sample_rate);
  UInt32 size = sizeof(format);
#if !defined(NDEBUG)
  LogStreamDescription(format);
#endif

  // Set the format on the output scope of the input element/bus.
  result =
      AudioUnitSetProperty(vpio_unit_, kAudioUnitProperty_StreamFormat,
                           kAudioUnitScope_Output, kInputBus, &format, size);
  if (result != noErr) {
    RTCLogError(@"Failed to set format on output scope of input bus. "
                 "Error=%ld.",
                (long)result);
    return false;
  }

  // Set the format on the input scope of the output element/bus.
  result =
      AudioUnitSetProperty(vpio_unit_, kAudioUnitProperty_StreamFormat,
                           kAudioUnitScope_Input, kOutputBus, &format, size);
  if (result != noErr) {
    RTCLogError(@"Failed to set format on input scope of output bus. "
                 "Error=%ld.",
                (long)result);
    return false;
  }

  // Initialize the Voice Processing I/O unit instance.
  // Calls to AudioUnitInitialize() can fail if called back-to-back on
  // different ADM instances. The error message in this case is -66635 which is
  // undocumented. Tests have shown that calling AudioUnitInitialize a second
  // time, after a short sleep, avoids this issue.
  // See webrtc:5166 for details.
  int failed_initialize_attempts = 0;
  result = AudioUnitInitialize(vpio_unit_);
  while (result != noErr) {
    RTCLogError(@"Failed to initialize the Voice Processing I/O unit. "
                 "Error=%ld.",
                (long)result);
    ++failed_initialize_attempts;
    if (failed_initialize_attempts == kMaxNumberOfAudioUnitInitializeAttempts) {
      // Max number of initialization attempts exceeded, hence abort.
      RTCLogError(@"Too many initialization attempts.");
      return false;
    }
    RTCLog(@"Pause 100ms and try audio unit initialization again...");
    [NSThread sleepForTimeInterval:0.1f];
    result = AudioUnitInitialize(vpio_unit_);
  }
  // The loop above only exits normally once result == noErr.
  RTCLog(@"Voice Processing I/O unit is now initialized.");

  if (detect_mute_speech_) {
    if (@available(iOS 15, *)) {
      // Set listener for muted speech event. The block reaches observer_
      // through the enclosing object.
      // NOTE(review): confirm this object outlives the registered listener.
      AUVoiceIOMutedSpeechActivityEventListener listener =
          ^(AUVoiceIOSpeechActivityEvent event) {
            observer_->OnReceivedMutedSpeechActivity(event);
          };
      result = AudioUnitSetProperty(vpio_unit_,
                                    kAUVoiceIOProperty_MutedSpeechActivityEventListener,
                                    kAudioUnitScope_Global,
                                    0,
                                    &listener,
                                    sizeof(AUVoiceIOMutedSpeechActivityEventListener));
      if (result != noErr) {
        RTCLog(@"Failed to set muted speech activity event listener. Error=%ld.", (long)result);
      }
    }
  }

  if (bypass_voice_processing_) {
    // Attempt to disable builtin voice processing.
    UInt32 toggle = 1;
    result = AudioUnitSetProperty(vpio_unit_,
                                  kAUVoiceIOProperty_BypassVoiceProcessing,
                                  kAudioUnitScope_Global,
                                  kInputBus,
                                  &toggle,
                                  sizeof(toggle));
    if (result == noErr) {
      RTCLog(@"Successfully bypassed voice processing.");
    } else {
      RTCLogError(@"Failed to bypass voice processing. Error=%ld.", (long)result);
    }
    // With bypass requested the AGC checks below are skipped, whether or not
    // the bypass property could be set.
    state_ = kInitialized;
    return true;
  }

  // AGC should be enabled by default for Voice Processing I/O units but it is
  // checked below and enabled explicitly if needed. This scheme is used
  // to be absolutely sure that the AGC is enabled since we have seen cases
  // where only zeros are recorded and a disabled AGC could be one of the
  // reasons why it happens.
  int agc_was_enabled_by_default = 0;
  UInt32 agc_is_enabled = 0;
  result = GetAGCState(vpio_unit_, &agc_is_enabled);
  if (result != noErr) {
    RTCLogError(@"Failed to get AGC state (1st attempt). "
                 "Error=%ld.",
                (long)result);
    // Example of error code: kAudioUnitErr_NoConnection (-10876).
    // All error codes related to audio units are negative and are therefore
    // converted into a positive value to match the UMA APIs.
    RTC_HISTOGRAM_COUNTS_SPARSE_100000(
        "WebRTC.Audio.GetAGCStateErrorCode1", (-1) * result);
  } else if (agc_is_enabled) {
    // Remember that the AGC was enabled by default. Will be used in UMA.
    agc_was_enabled_by_default = 1;
  } else {
    // AGC was initially disabled => try to enable it explicitly.
    UInt32 enable_agc = 1;
    result =
        AudioUnitSetProperty(vpio_unit_,
                             kAUVoiceIOProperty_VoiceProcessingEnableAGC,
                             kAudioUnitScope_Global, kInputBus, &enable_agc,
                             sizeof(enable_agc));
    if (result != noErr) {
      RTCLogError(@"Failed to enable the built-in AGC. "
                   "Error=%ld.",
                  (long)result);
      RTC_HISTOGRAM_COUNTS_SPARSE_100000(
          "WebRTC.Audio.SetAGCStateErrorCode", (-1) * result);
    }
    // Re-read the state so the histogram below reflects what the unit reports
    // after the enable attempt.
    result = GetAGCState(vpio_unit_, &agc_is_enabled);
    if (result != noErr) {
      RTCLogError(@"Failed to get AGC state (2nd attempt). "
                   "Error=%ld.",
                  (long)result);
      RTC_HISTOGRAM_COUNTS_SPARSE_100000(
          "WebRTC.Audio.GetAGCStateErrorCode2", (-1) * result);
    }
  }

  // Track if the built-in AGC was enabled by default (as it should) or not.
  RTC_HISTOGRAM_BOOLEAN("WebRTC.Audio.BuiltInAGCWasEnabledByDefault",
                        agc_was_enabled_by_default);
  RTCLog(@"WebRTC.Audio.BuiltInAGCWasEnabledByDefault: %d",
         agc_was_enabled_by_default);
  // As a final step, add an UMA histogram for tracking the AGC state.
  // At this stage, the AGC should be enabled, and if it is not, more work is
  // needed to find out the root cause.
  RTC_HISTOGRAM_BOOLEAN("WebRTC.Audio.BuiltInAGCIsEnabled", agc_is_enabled);
  RTCLog(@"WebRTC.Audio.BuiltInAGCIsEnabled: %u",
         static_cast<unsigned int>(agc_is_enabled));

  state_ = kInitialized;
  return true;
}

// Starts the audio unit via AudioOutputUnitStart(). On success state_ becomes
// kStarted and noErr is returned. On failure the error is logged, state_ is
// left unchanged and the failing OSStatus is returned.
OSStatus VoiceProcessingAudioUnit::Start() {
  RTC_DCHECK_GE(state_, kUninitialized);
  RTCLog(@"Starting audio unit.");

  const OSStatus result = AudioOutputUnitStart(vpio_unit_);
  if (result != noErr) {
    RTCLogError(@"Failed to start audio unit. Error=%ld", (long)result);
    return result;
  }
  RTCLog(@"Started audio unit");
  state_ = kStarted;
  return noErr;
}

// Stops the audio unit via AudioOutputUnitStop(). Returns true and sets
// state_ to kInitialized on success. On failure the error is logged, state_
// is left unchanged and false is returned.
bool VoiceProcessingAudioUnit::Stop() {
  RTC_DCHECK_GE(state_, kUninitialized);
  RTCLog(@"Stopping audio unit.");

  const OSStatus result = AudioOutputUnitStop(vpio_unit_);
  if (result != noErr) {
    RTCLogError(@"Failed to stop audio unit. Error=%ld", (long)result);
    return false;
  }
  RTCLog(@"Stopped audio unit");
  state_ = kInitialized;
  return true;
}

// Uninitializes the audio unit via AudioUnitUninitialize(). Returns true and
// sets state_ to kUninitialized on success. On failure the error is logged,
// state_ is left unchanged and false is returned.
bool VoiceProcessingAudioUnit::Uninitialize() {
  RTC_DCHECK_GE(state_, kUninitialized);
  RTCLog(@"Unintializing audio unit.");

  const OSStatus result = AudioUnitUninitialize(vpio_unit_);
  if (result != noErr) {
    RTCLogError(@"Failed to uninitialize audio unit. Error=%ld", (long)result);
    return false;
  }
  RTCLog(@"Uninitialized audio unit.");
  state_ = kUninitialized;
  return true;
}

// Mutes (`enable` == true) or unmutes (`enable` == false) the microphone.
//
// Two mechanisms are used depending on detect_mute_speech_:
//  - when set, the kAUVoiceIOProperty_MuteOutput property is written with 1 to
//    mute and 0 to unmute;
//  - otherwise input I/O on the input bus is disabled to mute and re-enabled
//    to unmute.
// NOTE(review): presumably the first variant keeps capture running so that
// muted-speech detection still works -- confirm against the header or caller.
//
// Returns true on success; logs the error and returns false otherwise.
bool VoiceProcessingAudioUnit::SetMicrophoneMute(bool enable) {
  RTC_DCHECK_GE(state_, kUninitialized);
  RTCLog(@"Setting microphone %s.", enable ? "mute" : "unmute");

  OSStatus result = noErr;
  if (detect_mute_speech_) {
    UInt32 muteUplinkOutput = enable ? 1 : 0;
    result = AudioUnitSetProperty(vpio_unit_,
                                  kAUVoiceIOProperty_MuteOutput,
                                  kAudioUnitScope_Global,
                                  kInputBus,
                                  &muteUplinkOutput,
                                  sizeof(muteUplinkOutput));
  } else {
    // Note the inversion: muting means disabling input.
    UInt32 enableInput = enable ? 0 : 1;
    result = AudioUnitSetProperty(vpio_unit_,
                                  kAudioOutputUnitProperty_EnableIO,
                                  kAudioUnitScope_Input,
                                  kInputBus,
                                  &enableInput,
                                  sizeof(enableInput));
  }

  if (result != noErr) {
    RTCLogError(@"Failed to %s microphone. Error=%ld", (enable ? "mute" : "unmute"), (long)result);
    return false;
  }

  RTCLog(@"Set microphone %s.", enable ? "mute" : "unmute");
  return true;
}

// Thin wrapper around AudioUnitRender() for the owned unit. All arguments are
// forwarded unchanged. A failure is logged, and the OSStatus from
// AudioUnitRender() is returned in either case. Requires that the audio unit
// has been created (Init() called).
OSStatus VoiceProcessingAudioUnit::Render(AudioUnitRenderActionFlags* flags,
                                          const AudioTimeStamp* time_stamp,
                                          UInt32 output_bus_number,
                                          UInt32 num_frames,
                                          AudioBufferList* io_data) {
  RTC_DCHECK(vpio_unit_) << "Init() not called.";

  const OSStatus result = AudioUnitRender(vpio_unit_, flags, time_stamp,
                                          output_bus_number, num_frames,
                                          io_data);
  if (result != noErr) {
    RTCLogError(@"Failed to render audio unit. Error=%ld", (long)result);
  }
  return result;
}

// Static trampoline installed by Init() as the render callback on the output
// bus. `in_ref_con` is the VoiceProcessingAudioUnit instance that registered
// the callback; the remaining arguments are forwarded to its
// NotifyGetPlayoutData() and that call's result is returned.
OSStatus VoiceProcessingAudioUnit::OnGetPlayoutData(
    void* in_ref_con,
    AudioUnitRenderActionFlags* flags,
    const AudioTimeStamp* time_stamp,
    UInt32 bus_number,
    UInt32 num_frames,
    AudioBufferList* io_data) {
  VoiceProcessingAudioUnit* audio_unit =
      static_cast<VoiceProcessingAudioUnit*>(in_ref_con);
  return audio_unit->NotifyGetPlayoutData(flags, time_stamp, bus_number,
                                          num_frames, io_data);
}

// Static trampoline installed by Init() as the input callback on the input
// bus. `in_ref_con` is the VoiceProcessingAudioUnit instance that registered
// the callback; the remaining arguments are forwarded to its
// NotifyDeliverRecordedData() and that call's result is returned.
OSStatus VoiceProcessingAudioUnit::OnDeliverRecordedData(
    void* in_ref_con,
    AudioUnitRenderActionFlags* flags,
    const AudioTimeStamp* time_stamp,
    UInt32 bus_number,
    UInt32 num_frames,
    AudioBufferList* io_data) {
  VoiceProcessingAudioUnit* audio_unit =
      static_cast<VoiceProcessingAudioUnit*>(in_ref_con);
  return audio_unit->NotifyDeliverRecordedData(flags, time_stamp, bus_number,
                                               num_frames, io_data);
}

// Forwards a playout-data request to the observer and returns the observer's
// OSStatus unchanged.
OSStatus VoiceProcessingAudioUnit::NotifyGetPlayoutData(
    AudioUnitRenderActionFlags* flags,
    const AudioTimeStamp* time_stamp,
    UInt32 bus_number,
    UInt32 num_frames,
    AudioBufferList* io_data) {
  return observer_->OnGetPlayoutData(flags, time_stamp, bus_number, num_frames,
                                     io_data);
}

// Forwards a recorded-data notification to the observer and returns the
// observer's OSStatus unchanged.
OSStatus VoiceProcessingAudioUnit::NotifyDeliverRecordedData(
    AudioUnitRenderActionFlags* flags,
    const AudioTimeStamp* time_stamp,
    UInt32 bus_number,
    UInt32 num_frames,
    AudioBufferList* io_data) {
  return observer_->OnDeliverRecordedData(flags, time_stamp, bus_number,
                                          num_frames, io_data);
}

// Builds the stream format used for both the input and the output bus at the
// given `sample_rate`: packed, signed-integer linear PCM with one channel,
// one frame per packet and kBytesPerSample bytes per sample.
AudioStreamBasicDescription VoiceProcessingAudioUnit::GetFormat(
    Float64 sample_rate) const {
  // Set the application formats for input and output:
  // - use same format in both directions
  // - avoid resampling in the I/O unit by using the hardware sample rate
  // - linear PCM => noncompressed audio data format with one frame per packet
  // - no need to specify interleaving since only mono is supported
  RTC_DCHECK_EQ(1, kRTCAudioSessionPreferredNumberOfChannels);

  // Value-initialize so that every field not assigned below -- notably
  // mReserved, which must be 0 -- has a defined value. The struct is later
  // logged field by field and handed to AudioUnitSetProperty().
  AudioStreamBasicDescription format = {};
  format.mSampleRate = sample_rate;
  format.mFormatID = kAudioFormatLinearPCM;
  format.mFormatFlags =
      kLinearPCMFormatFlagIsSignedInteger | kLinearPCMFormatFlagIsPacked;
  format.mBytesPerPacket = kBytesPerSample;
  format.mFramesPerPacket = 1;  // uncompressed.
  format.mBytesPerFrame = kBytesPerSample;
  format.mChannelsPerFrame = kRTCAudioSessionPreferredNumberOfChannels;
  format.mBitsPerChannel = 8 * kBytesPerSample;
  return format;
}

// Tears down and releases the audio unit, if there is one. Depending on the
// current state the unit is first stopped and/or uninitialized; the results
// of those calls are not checked here (each logs its own failure). A failing
// AudioComponentInstanceDispose() is logged, and vpio_unit_ is reset to null
// in either case. Does nothing when no unit exists.
void VoiceProcessingAudioUnit::DisposeAudioUnit() {
  if (!vpio_unit_) {
    return;
  }

  switch (state_) {
    case kStarted:
      Stop();
      // A started unit is also initialized, so continue with Uninitialize().
      [[fallthrough]];
    case kInitialized:
      Uninitialize();
      break;
    case kUninitialized:
    case kInitRequired:
      break;
  }

  RTCLog(@"Disposing audio unit.");
  const OSStatus result = AudioComponentInstanceDispose(vpio_unit_);
  if (result != noErr) {
    RTCLogError(@"AudioComponentInstanceDispose failed. Error=%ld.",
                (long)result);
  }
  vpio_unit_ = nullptr;
}

}  // namespace ios_adm

}  // namespace webrtc

Source code

Revision control

Copy as Markdown

Other Tools