simulator.cc - mozsearch

/*

 *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.

 *  Use of this source code is governed by a BSD-style license

 *  that can be found in the LICENSE file in the root of the source

 *  tree. An additional intellectual property rights grant can be found

 *  in the file PATENTS.  All contributing project authors may

 *  be found in the AUTHORS file in the root of the source tree.

*/

#include "modules/audio_processing/test/conversational_speech/simulator.h"

#include <math.h>

#include <algorithm>

#include <memory>

#include <set>

#include <utility>

#include <vector>

#include "absl/strings/string_view.h"

#include "api/array_view.h"

#include "common_audio/include/audio_util.h"

#include "common_audio/wav_file.h"

#include "modules/audio_processing/test/conversational_speech/wavreader_interface.h"

#include "rtc_base/logging.h"

#include "rtc_base/numerics/safe_conversions.h"

#include "test/testsupport/file_utils.h"

namespace webrtc {

namespace test {

namespace {

using conversational_speech::MultiEndCall;

using conversational_speech::SpeakerOutputFilePaths;

using conversational_speech::WavReaderInterface;

// Combines the output path and the speaker names to define the output file
// paths for the near-end and far-end audio tracks.
//
// For every name in `speaker_names`, the near-end file is named
// "s_<name>-near_end.wav" and the far-end file "s_<name>-far_end.wav"; both
// are joined onto `output_path` with test::JoinFilename(). Each resulting path
// is logged at verbose level.
//
// Returns a newly allocated map from speaker name to its pair of paths.
std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>>
InitSpeakerOutputFilePaths(const std::set<std::string>& speaker_names,
                           absl::string_view output_path) {
  // Create map.
  auto speaker_output_file_paths_map =
      std::make_unique<std::map<std::string, SpeakerOutputFilePaths>>();

  // Add near-end and far-end output paths into the map.
  for (const auto& speaker_name : speaker_names) {
    const std::string near_end_path =
        test::JoinFilename(output_path, "s_" + speaker_name + "-near_end.wav");
    RTC_LOG(LS_VERBOSE) << "The near-end audio track will be created in "
                        << near_end_path << ".";

    const std::string far_end_path =
        test::JoinFilename(output_path, "s_" + speaker_name + "-far_end.wav");
    RTC_LOG(LS_VERBOSE) << "The far-end audio track will be created in "
                        << far_end_path << ".";

    // Add to map. The mapped value is constructed in place from the two paths.
    speaker_output_file_paths_map->emplace(
        std::piecewise_construct, std::forward_as_tuple(speaker_name),
        std::forward_as_tuple(near_end_path, far_end_path));
  }

  return speaker_output_file_paths_map;
}

// Bundles the two WavWriter instances that belong to one speaker: one for the
// speaker's near-end output track and one for the far-end output track.
class SpeakerWavWriters {
 public:
  // Opens both writers using the paths stored in `output_file_paths`. The same
  // `sample_rate` is forwarded to each writer.
  SpeakerWavWriters(const SpeakerOutputFilePaths& output_file_paths,
                    int sample_rate)
      : near_end_wav_writer_(output_file_paths.near_end,
                             sample_rate,
                             kNumChannels),
        far_end_wav_writer_(output_file_paths.far_end,
                            sample_rate,
                            kNumChannels) {}

  // Non-owning access to the writers; the pointers stay valid for as long as
  // this object is alive.
  WavWriter* near_end_wav_writer() { return &near_end_wav_writer_; }
  WavWriter* far_end_wav_writer() { return &far_end_wav_writer_; }

 private:
  // Third WavWriter constructor argument, identical for both tracks.
  // NOTE(review): presumably the channel count (mono) - confirm against
  // common_audio/wav_file.h.
  static constexpr size_t kNumChannels = 1u;

  // Declaration order matters: the near-end writer is constructed first.
  WavWriter near_end_wav_writer_;
  WavWriter far_end_wav_writer_;
};

// Initializes one WavWriter instance for each speaker and both the near-end and

// far-end output tracks.

std::unique_ptr<std::map<std::string, SpeakerWavWriters>>

InitSpeakersWavWriters(const std::map<std::string, SpeakerOutputFilePaths>&

                           speaker_output_file_paths,

                       int sample_rate) {

  // Create map.

  auto speaker_wav_writers_map =

      std::make_unique<std::map<std::string, SpeakerWavWriters>>();

  // Add SpeakerWavWriters instance into the map.

  for (auto it = speaker_output_file_paths.begin();

       it != speaker_output_file_paths.end(); ++it) {

    speaker_wav_writers_map->emplace(

        std::piecewise_construct, std::forward_as_tuple(it->first),

        std::forward_as_tuple(it->second, sample_rate));

  return speaker_wav_writers_map;

// Reads all the samples for each audio track.

std::unique_ptr<std::map<std::string, std::vector<int16_t>>> PreloadAudioTracks(

    const std::map<std::string, std::unique_ptr<WavReaderInterface>>&

        audiotrack_readers) {

  // Create map.

  auto audiotracks_map =

      std::make_unique<std::map<std::string, std::vector<int16_t>>>();

  // Add audio track vectors.

  for (auto it = audiotrack_readers.begin(); it != audiotrack_readers.end();

       ++it) {

    // Add map entry.

    audiotracks_map->emplace(std::piecewise_construct,

                             std::forward_as_tuple(it->first),

                             std::forward_as_tuple(it->second->NumSamples()));

    // Read samples.

    it->second->ReadInt16Samples(audiotracks_map->at(it->first));

  return audiotracks_map;

// Writes all the values in `source_samples` via `wav_writer`. If the number of
// previously written samples in `wav_writer` is less than `interval_begin`, it
// first adds zeros as left padding so that the chunk starts at sample index
// `interval_begin`. The padding corresponds to intervals during which a
// speaker is not active.
//
// `wav_writer` must be non-null and must not have already written more than
// `interval_begin` samples; both conditions are enforced with RTC_CHECK.
void PadLeftWriteChunk(rtc::ArrayView<const int16_t> source_samples,
                       size_t interval_begin,
                       WavWriter* wav_writer) {
  RTC_CHECK(wav_writer);
  // Guards the unsigned subtraction below against wrapping around.
  RTC_CHECK_GE(interval_begin, wav_writer->num_samples());

  // Add left padding.
  const size_t padding_size = interval_begin - wav_writer->num_samples();
  if (padding_size != 0) {
    const std::vector<int16_t> padding(padding_size, 0);
    wav_writer->WriteSamples(padding.data(), padding_size);
  }

  // Write source samples.
  wav_writer->WriteSamples(source_samples.data(), source_samples.size());
}

// Appends zeros via `wav_writer` until the total number of written samples
// equals `pad_samples`. The number of zeros appended is the difference between
// `pad_samples` and the samples written so far; nothing is written when the
// two are already equal.
//
// `wav_writer` must be non-null and must not have already written more than
// `pad_samples` samples; both conditions are enforced with RTC_CHECK.
void PadRightWrite(WavWriter* wav_writer, size_t pad_samples) {
  RTC_CHECK(wav_writer);
  // Guards the unsigned subtraction below against wrapping around.
  RTC_CHECK_GE(pad_samples, wav_writer->num_samples());

  const size_t padding_size = pad_samples - wav_writer->num_samples();
  if (padding_size != 0) {
    const std::vector<int16_t> padding(padding_size, 0);
    wav_writer->WriteSamples(padding.data(), padding_size);
  }
}

// Scales `source_samples` by `gain` and stores the result in `output_samples`.
//
// `gain` is converted to a linear factor with DbToRatio(); each input sample
// is multiplied by that factor and narrowed back to int16_t with
// rtc::saturated_cast. `output_samples` must have the same size as
// `source_samples` (checked with RTC_DCHECK_EQ).
void ScaleSignal(rtc::ArrayView<const int16_t> source_samples,
                 int gain,
                 rtc::ArrayView<int16_t> output_samples) {
  // Validate the precondition before doing any work.
  RTC_DCHECK_EQ(source_samples.size(), output_samples.size());

  const float gain_linear = DbToRatio(gain);
  std::transform(source_samples.begin(), source_samples.end(),
                 output_samples.begin(), [gain_linear](int16_t x) -> int16_t {
                   return rtc::saturated_cast<int16_t>(x * gain_linear);
                 });
}

}  // namespace

namespace conversational_speech {

std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>> Simulate(

    const MultiEndCall& multiend_call,

    absl::string_view output_path) {

  // Set output file paths and initialize wav writers.

  const auto& speaker_names = multiend_call.speaker_names();

  auto speaker_output_file_paths =

      InitSpeakerOutputFilePaths(speaker_names, output_path);

  auto speakers_wav_writers = InitSpeakersWavWriters(

      *speaker_output_file_paths, multiend_call.sample_rate());

  // Preload all the input audio tracks.

  const auto& audiotrack_readers = multiend_call.audiotrack_readers();

  auto audiotracks = PreloadAudioTracks(audiotrack_readers);

  // TODO(alessiob): When speaker_names.size() == 2, near-end and far-end

  // across the 2 speakers are symmetric; hence, the code below could be

  // replaced by only creating the near-end or the far-end. However, this would

  // require to split the unit tests and document the behavior in README.md.

  // In practice, it should not be an issue since the files are not expected to

  // be signinificant.

  // Write near-end and far-end output tracks.

  for (const auto& speaking_turn : multiend_call.speaking_turns()) {

    const std::string& active_speaker_name = speaking_turn.speaker_name;

    const auto source_audiotrack =

        audiotracks->at(speaking_turn.audiotrack_file_name);

    std::vector<int16_t> scaled_audiotrack(source_audiotrack.size());

    ScaleSignal(source_audiotrack, speaking_turn.gain, scaled_audiotrack);

    // Write active speaker's chunk to active speaker's near-end.

    PadLeftWriteChunk(

        scaled_audiotrack, speaking_turn.begin,

        speakers_wav_writers->at(active_speaker_name).near_end_wav_writer());

    // Write active speaker's chunk to other participants' far-ends.

    for (const std::string& speaker_name : speaker_names) {

      if (speaker_name == active_speaker_name)

        continue;

      PadLeftWriteChunk(

          scaled_audiotrack, speaking_turn.begin,

          speakers_wav_writers->at(speaker_name).far_end_wav_writer());

  // Finalize all the output tracks with right padding.

  // This is required to make all the output tracks duration equal.

  size_t duration_samples = multiend_call.total_duration_samples();

  for (const std::string& speaker_name : speaker_names) {

    PadRightWrite(speakers_wav_writers->at(speaker_name).near_end_wav_writer(),

                  duration_samples);

    PadRightWrite(speakers_wav_writers->at(speaker_name).far_end_wav_writer(),

                  duration_samples);

  return speaker_output_file_paths;

}  // namespace conversational_speech

}  // namespace test

}  // namespace webrtc

Source code

Revision control

Copy as Markdown

Other Tools