SpeechRecognition.cpp

firefox-main/dom/media/webspeech/recognition/SpeechRecognition.cpp (file symbol)

Enable keyboard shortcuts

Source code

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include <algorithm>

#include "AudioSegment.h"

#include "MediaEnginePrefs.h"

#include "SpeechTrackListener.h"

#include "VideoUtils.h"

#include "endpointer.h"

#include "mozilla/AbstractThread.h"

#include "mozilla/MediaManager.h"

#include "mozilla/Preferences.h"

#include "mozilla/ResultVariant.h"

#include "mozilla/Services.h"

#include "mozilla/StaticPrefs_media.h"

#include "mozilla/dom/AudioStreamTrack.h"

#include "mozilla/dom/BindingUtils.h"

#include "mozilla/dom/Document.h"

#include "mozilla/dom/Element.h"

#include "mozilla/dom/MediaStreamError.h"

#include "mozilla/dom/MediaStreamTrackBinding.h"

#include "mozilla/dom/RootedDictionary.h"

#include "mozilla/dom/SpeechGrammar.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"

#include "mozilla/dom/SpeechRecognitionEvent.h"

#include "nsCOMPtr.h"

#include "nsComponentManagerUtils.h"

#include "nsContentUtils.h"

#include "nsCycleCollectionParticipant.h"

#include "nsGlobalWindowInner.h"

#include "nsIObserverService.h"

#include "nsIPermissionManager.h"

#include "nsIPrincipal.h"

#include "nsPIDOMWindow.h"

#include "nsQueryObject.h"

#include "nsServiceManagerUtils.h"

// Undo the windows.h damage

#if defined(XP_WIN) && defined(GetMessage)

#  undef GetMessage

#endif

namespace mozilla::dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"

#define DEFAULT_RECOGNITION_SERVICE "online"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"

#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH \

  "media.webspeech.long_silence_length"

#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH \

  "media.webspeech.long_speech_length"

#define PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS \

  "media.webspeech.recognition.timeout"

static const uint32_t kSAMPLE_RATE = 16000;

// number of frames corresponding to 300ms of audio to send to endpointer while

// it's in environment estimation mode

// kSAMPLE_RATE frames = 1s, kESTIMATION_FRAMES frames = 300ms

static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;

LogModule* GetSpeechRecognitionLog() {

  static LazyLogModule sLog("SpeechRecognition");

  return sLog;

#define SR_LOG(...) \

  MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

namespace {

class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker {

 public:

  SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition,

                                   const nsString& aName)

      : media::ShutdownBlocker(aName), mRecognition(aRecognition) {}

  NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override {

    MOZ_ASSERT(NS_IsMainThread());

    // AbortSilently will eventually clear the blocker.

    mRecognition->Abort();

    return NS_OK;

 private:

  const RefPtr<SpeechRecognition> mRecognition;

};

enum class ServiceCreationError {

  ServiceNotFound,

};

Result<nsCOMPtr<nsISpeechRecognitionService>, ServiceCreationError>

CreateSpeechRecognitionService(nsPIDOMWindowInner* aWindow,

                               SpeechRecognition* aRecognition,

                               const nsAString& aLang) {

  nsAutoCString speechRecognitionServiceCID;

  nsAutoCString prefValue;

  Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue);

  nsAutoCString speechRecognitionService;

  if (!prefValue.IsEmpty()) {

    speechRecognitionService = prefValue;

  } else {

    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;

  if (StaticPrefs::media_webspeech_test_fake_recognition_service()) {

    speechRecognitionServiceCID =

        NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

  } else {

    speechRecognitionServiceCID =

        nsLiteralCString(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +

        speechRecognitionService;

  nsresult rv;

  nsCOMPtr<nsISpeechRecognitionService> recognitionService;

  recognitionService =

      do_CreateInstance(speechRecognitionServiceCID.get(), &rv);

  if (!recognitionService) {

    return Err(ServiceCreationError::ServiceNotFound);

  return recognitionService;

}  // namespace

NS_IMPL_CYCLE_COLLECTION_WEAK_PTR_INHERITED(SpeechRecognition,

                                            DOMEventTargetHelper, mStream,

                                            mTrack, mRecognitionService,

                                            mSpeechGrammarList, mListener)

NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition)

  NS_INTERFACE_MAP_ENTRY(nsIObserver)

NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)

NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition::TrackListener,

                                   DOMMediaStream::TrackListener,

                                   mSpeechRecognition)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition::TrackListener,

                         DOMMediaStream::TrackListener)

NS_IMPL_RELEASE_INHERITED(SpeechRecognition::TrackListener,

                          DOMMediaStream::TrackListener)

NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition::TrackListener)

NS_INTERFACE_MAP_END_INHERITING(DOMMediaStream::TrackListener)

SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)

    : DOMEventTargetHelper(aOwnerWindow),

      mEndpointer(kSAMPLE_RATE),

      mAudioSamplesPerChunk(mEndpointer.FrameSize()),

      mSpeechDetectionTimer(NS_NewTimer()),

      mSpeechGrammarList(new SpeechGrammarList(GetOwnerGlobal())),

      mContinuous(false),

      mInterimResults(false),

      mMaxAlternatives(1) {

  SR_LOG("created SpeechRecognition");

  if (StaticPrefs::media_webspeech_test_enable()) {

    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();

    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);

    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);

  mEndpointer.set_speech_input_complete_silence_length(

      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000));

  mEndpointer.set_long_speech_input_complete_silence_length(

      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));

  mEndpointer.set_long_speech_length(

      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 3 * 1000000));

  mSpeechDetectionTimeoutMs =

      Preferences::GetInt(PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS, 10000);

  Reset();

SpeechRecognition::~SpeechRecognition() = default;

bool SpeechRecognition::StateBetween(FSMState begin, FSMState end) {

  return mCurrentState >= begin && mCurrentState <= end;

void SpeechRecognition::SetState(FSMState state) {

  mCurrentState = state;

  SR_LOG("Transitioned to state %s", GetName(mCurrentState));

JSObject* SpeechRecognition::WrapObject(JSContext* aCx,

                                        JS::Handle<JSObject*> aGivenProto) {

  return SpeechRecognition_Binding::Wrap(aCx, this, aGivenProto);

already_AddRefed<SpeechRecognition> SpeechRecognition::Constructor(

    const GlobalObject& aGlobal, ErrorResult& aRv) {

  nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports());

  if (!win) {

    aRv.Throw(NS_ERROR_FAILURE);

    return nullptr;

  RefPtr<SpeechRecognition> object = new SpeechRecognition(win);

  return object.forget();

void SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) {

  SR_LOG("Processing %s, current state is %s", GetName(aEvent),

         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {

    // ignore all events while aborting

    return;

  Transition(aEvent);

void SpeechRecognition::Transition(SpeechEvent* aEvent) {

  switch (mCurrentState) {

    case STATE_IDLE:

      switch (aEvent->mType) {

        case EVENT_START:

          // TODO: may want to time out if we wait too long

          // for user to approve

          WaitForAudioData(aEvent);

          break;

        case EVENT_STOP:

        case EVENT_ABORT:

        case EVENT_AUDIO_DATA:

        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:

        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:

          DoNothing(aEvent);

          break;

        case EVENT_AUDIO_ERROR:

        case EVENT_RECOGNITIONSERVICE_ERROR:

          AbortError(aEvent);

          break;

        default:

          MOZ_CRASH("Invalid event");

      break;

    case STATE_STARTING:

      switch (aEvent->mType) {

        case EVENT_AUDIO_DATA:

          StartedAudioCapture(aEvent);

          break;

        case EVENT_AUDIO_ERROR:

        case EVENT_RECOGNITIONSERVICE_ERROR:

          AbortError(aEvent);

          break;

        case EVENT_ABORT:

          AbortSilently(aEvent);

          break;

        case EVENT_STOP:

          ResetAndEnd();

          break;

        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:

        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:

          DoNothing(aEvent);

          break;

        case EVENT_START:

          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));

          MOZ_CRASH();

        default:

          MOZ_CRASH("Invalid event");

      break;

    case STATE_ESTIMATING:

      switch (aEvent->mType) {

        case EVENT_AUDIO_DATA:

          WaitForEstimation(aEvent);

          break;

        case EVENT_STOP:

          StopRecordingAndRecognize(aEvent);

          break;

        case EVENT_ABORT:

          AbortSilently(aEvent);

          break;

        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:

        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:

        case EVENT_RECOGNITIONSERVICE_ERROR:

          DoNothing(aEvent);

          break;

        case EVENT_AUDIO_ERROR:

          AbortError(aEvent);

          break;

        case EVENT_START:

          SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);

          MOZ_CRASH();

        default:

          MOZ_CRASH("Invalid event");

      break;

    case STATE_WAITING_FOR_SPEECH:

      switch (aEvent->mType) {

        case EVENT_AUDIO_DATA:

          DetectSpeech(aEvent);

          break;

        case EVENT_STOP:

          StopRecordingAndRecognize(aEvent);

          break;

        case EVENT_ABORT:

          AbortSilently(aEvent);

          break;

        case EVENT_AUDIO_ERROR:

          AbortError(aEvent);

          break;

        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:

        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:

        case EVENT_RECOGNITIONSERVICE_ERROR:

          DoNothing(aEvent);

          break;

        case EVENT_START:

          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));

          MOZ_CRASH();

        default:

          MOZ_CRASH("Invalid event");

      break;

    case STATE_RECOGNIZING:

      switch (aEvent->mType) {

        case EVENT_AUDIO_DATA:

          WaitForSpeechEnd(aEvent);

          break;

        case EVENT_STOP:

          StopRecordingAndRecognize(aEvent);

          break;

        case EVENT_AUDIO_ERROR:

        case EVENT_RECOGNITIONSERVICE_ERROR:

          AbortError(aEvent);

          break;

        case EVENT_ABORT:

          AbortSilently(aEvent);

          break;

        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:

        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:

          DoNothing(aEvent);

          break;

        case EVENT_START:

          SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));

          MOZ_CRASH();

        default:

          MOZ_CRASH("Invalid event");

      break;

    case STATE_WAITING_FOR_RESULT:

      switch (aEvent->mType) {

        case EVENT_STOP:

          DoNothing(aEvent);

          break;

        case EVENT_AUDIO_ERROR:

        case EVENT_RECOGNITIONSERVICE_ERROR:

          AbortError(aEvent);

          break;

        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:

          NotifyFinalResult(aEvent);

          break;

        case EVENT_AUDIO_DATA:

          DoNothing(aEvent);

          break;

        case EVENT_ABORT:

          AbortSilently(aEvent);

          break;

        case EVENT_START:

        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:

          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s",

                 GetName(aEvent));

          MOZ_CRASH();

        default:

          MOZ_CRASH("Invalid event");

      break;

    case STATE_ABORTING:

      switch (aEvent->mType) {

        case EVENT_STOP:

        case EVENT_ABORT:

        case EVENT_AUDIO_DATA:

        case EVENT_AUDIO_ERROR:

        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:

        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:

        case EVENT_RECOGNITIONSERVICE_ERROR:

          DoNothing(aEvent);

          break;

        case EVENT_START:

          SR_LOG("STATE_ABORTING: Unhandled aEvent %s", GetName(aEvent));

          MOZ_CRASH();

        default:

          MOZ_CRASH("Invalid event");

      break;

    default:

      MOZ_CRASH("Invalid state");

/*

 * Handle a segment of recorded audio data.

 * Returns the number of samples that were processed.

*/

uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment,

                                                TrackRate aTrackRate) {

  AudioSegment::ChunkIterator iterator(*aSegment);

  uint32_t samples = 0;

  while (!iterator.IsEnded()) {

    float out;

    mEndpointer.ProcessAudio(*iterator, &out);

    samples += iterator->GetDuration();

    iterator.Next();

  // we need to call the nsISpeechRecognitionService::ProcessAudioSegment

  // in a separate thread so that any eventual encoding or pre-processing

  // of the audio does not block the main thread

  nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction(

      "nsISpeechRecognitionService::ProcessAudioSegment",

      [=, service = mRecognitionService,

       segment = std::move(*aSegment)]() mutable {

        service->ProcessAudioSegment(&segment, aTrackRate);

      }));

  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));

  (void)rv;

  return samples;

/****************************************************************************

 * FSM Transition functions

 * If a transition function may cause a DOM event to be fired,

 * it may also be re-entered, since the event handler may cause the

 * event loop to spin and new SpeechEvents to be processed.

 * Rules:

 * 1) These methods should call SetState as soon as possible.

 * 2) If these methods dispatch DOM events, or call methods that dispatch

 * DOM events, that should be done as late as possible.

 * 3) If anything must happen after dispatching a DOM event, make sure

 * the state is still what the method expected it to be.

 ****************************************************************************/

void SpeechRecognition::Reset() {

  SetState(STATE_IDLE);

  // This breaks potential ref-cycles.

  mRecognitionService = nullptr;

  ++mStreamGeneration;

  if (mStream) {

    mStream->UnregisterTrackListener(mListener);

    mStream = nullptr;

    mListener = nullptr;

  mTrack = nullptr;

  mTrackIsOwned = false;

  mStopRecordingPromise = nullptr;

  mEncodeTaskQueue = nullptr;

  mEstimationSamples = 0;

  mBufferedSamples = 0;

  mSpeechDetectionTimer->Cancel();

  mAborted = false;

void SpeechRecognition::ResetAndEnd() {

  Reset();

  DispatchTrustedEvent(u"end"_ns);

void SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent) {

  SetState(STATE_STARTING);

void SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent) {

  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();

  mEstimationSamples +=

      ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);

  DispatchTrustedEvent(u"audiostart"_ns);

  if (mCurrentState == STATE_ESTIMATING) {

    DispatchTrustedEvent(u"start"_ns);

void SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) {

  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");

  // This will run SoundEnd on the service just before StopRecording begins

  // shutting the encode thread down.

  mSpeechListener->mRemovedPromise->Then(

      GetCurrentSerialEventTarget(), __func__,

      [service = mRecognitionService] { service->SoundEnd(); });

  StopRecording();

void SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent) {

  SetState(STATE_ESTIMATING);

  mEstimationSamples +=

      ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);

  if (mEstimationSamples > kESTIMATION_SAMPLES) {

    mEndpointer.SetUserInputMode();

    SetState(STATE_WAITING_FOR_SPEECH);

void SpeechRecognition::DetectSpeech(SpeechEvent* aEvent) {

  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);

  if (mEndpointer.DidStartReceivingSpeech()) {

    mSpeechDetectionTimer->Cancel();

    SetState(STATE_RECOGNIZING);

    DispatchTrustedEvent(u"speechstart"_ns);

void SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent) {

  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);

  if (mEndpointer.speech_input_complete()) {

    DispatchTrustedEvent(u"speechend"_ns);

    if (mCurrentState == STATE_RECOGNIZING) {

      // FIXME: StopRecordingAndRecognize should only be called for single

      // shot services for continuous we should just inform the service

      StopRecordingAndRecognize(aEvent);

void SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent) {

  ResetAndEnd();

  RootedDictionary<SpeechRecognitionEventInit> init(RootingCx());

  init.mBubbles = true;

  init.mCancelable = false;

  // init.mResultIndex = 0;

  init.mResults = aEvent->mRecognitionResultList;

  init.mInterpretation = JS::NullValue();

  // init.mEmma = nullptr;

  RefPtr<SpeechRecognitionEvent> event =

      SpeechRecognitionEvent::Constructor(this, u"result"_ns, init);

  event->SetTrusted(true);

  DispatchEvent(*event);

void SpeechRecognition::DoNothing(SpeechEvent* aEvent) {}

void SpeechRecognition::AbortSilently(SpeechEvent* aEvent) {

  if (mRecognitionService) {

    if (mTrack) {

      // This will run Abort on the service just before StopRecording begins

      // shutting the encode thread down.

      mSpeechListener->mRemovedPromise->Then(

          GetCurrentSerialEventTarget(), __func__,

          [service = mRecognitionService] { service->Abort(); });

    } else {

      // Recording hasn't started yet. We can just call Abort().

      mRecognitionService->Abort();

  StopRecording()->Then(

      GetCurrentSerialEventTarget(), __func__,

      [self = RefPtr<SpeechRecognition>(this), this] { ResetAndEnd(); });

  SetState(STATE_ABORTING);

void SpeechRecognition::AbortError(SpeechEvent* aEvent) {

  AbortSilently(aEvent);

  NotifyError(aEvent);

void SpeechRecognition::NotifyError(SpeechEvent* aEvent) {

  aEvent->mError->SetTrusted(true);

  DispatchEvent(*aEvent->mError);

/**************************************

 * Event triggers and other functions *

 **************************************/

NS_IMETHODIMP

SpeechRecognition::StartRecording(RefPtr<AudioStreamTrack>& aTrack) {

  // hold a reference so that the underlying track doesn't get collected.

  mTrack = aTrack;

  MOZ_ASSERT(!mTrack->Ended());

  mSpeechListener = SpeechTrackListener::Create(this);

  mTrack->AddListener(mSpeechListener);

  nsString blockerName;

  blockerName.AppendPrintf("SpeechRecognition %p shutdown", this);

  mShutdownBlocker =

      MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this, blockerName);

  media::MustGetShutdownBarrier()->AddBlocker(

      mShutdownBlocker, NS_LITERAL_STRING_FROM_CSTRING(__FILE__), __LINE__,

      u"SpeechRecognition shutdown"_ns);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, mSpeechDetectionTimeoutMs,

                                     nsITimer::TYPE_ONE_SHOT);

RefPtr<GenericNonExclusivePromise> SpeechRecognition::StopRecording() {

  if (!mTrack) {

    // Recording wasn't started, or has already been stopped.

    if (mStream) {

      // Ensure we don't start recording because a track became available

      // before we get reset.

      mStream->UnregisterTrackListener(mListener);

      mListener = nullptr;

    return GenericNonExclusivePromise::CreateAndResolve(true, __func__);

  if (mStopRecordingPromise) {

    return mStopRecordingPromise;

  mTrack->RemoveListener(mSpeechListener);

  if (mTrackIsOwned) {

    mTrack->Stop();

  mEndpointer.EndSession();

  DispatchTrustedEvent(u"audioend"_ns);

  // Block shutdown until the speech track listener has been removed from the

  // MSG, as it holds a reference to us, and we reference the world, which we

  // don't want to leak.

  mStopRecordingPromise =

      mSpeechListener->mRemovedPromise

          ->Then(

              GetCurrentSerialEventTarget(), __func__,

              [self = RefPtr<SpeechRecognition>(this), this] {

                SR_LOG("Shutting down encoding thread");

                return mEncodeTaskQueue->BeginShutdown();

},

              [] {

                MOZ_CRASH("Unexpected rejection");

                return ShutdownPromise::CreateAndResolve(false, __func__);

})

          ->Then(

              GetCurrentSerialEventTarget(), __func__,

              [self = RefPtr<SpeechRecognition>(this), this] {

                media::MustGetShutdownBarrier()->RemoveBlocker(

                    mShutdownBlocker);

                mShutdownBlocker = nullptr;

                MOZ_DIAGNOSTIC_ASSERT(mCurrentState != STATE_IDLE);

                return GenericNonExclusivePromise::CreateAndResolve(true,

                                                                    __func__);

},

              [] {

                MOZ_CRASH("Unexpected rejection");

                return GenericNonExclusivePromise::CreateAndResolve(false,

                                                                    __func__);

});

  return mStopRecordingPromise;

NS_IMETHODIMP

SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,

                           const char16_t* aData) {

  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&

      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,

                  SpeechRecognitionErrorCode::No_speech,

                  "No speech detected (timeout)");

  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {

    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();

    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);

    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);

  } else if (StaticPrefs::media_webspeech_test_fake_fsm_events() &&

             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {

    ProcessTestEventRequest(aSubject, nsDependentString(aData));

  return NS_OK;

void SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject,

                                                const nsAString& aEventName) {

  if (aEventName.EqualsLiteral("EVENT_ABORT")) {

    Abort();

  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {

    DispatchError(

        SpeechRecognition::EVENT_AUDIO_ERROR,

        SpeechRecognitionErrorCode::Audio_capture,  // TODO different codes?

        "AUDIO_ERROR test event");

  } else {

    NS_ASSERTION(StaticPrefs::media_webspeech_test_fake_recognition_service(),

                 "Got request for fake recognition service event, but "

                 "media.webspeech.test.fake_recognition_service is unset");

    // let the fake recognition service handle the request

already_AddRefed<SpeechGrammarList> SpeechRecognition::Grammars() const {

  RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList;

  return speechGrammarList.forget();

void SpeechRecognition::SetGrammars(SpeechGrammarList& aArg) {

  mSpeechGrammarList = &aArg;

void SpeechRecognition::GetLang(nsString& aRetVal) const { aRetVal = mLang; }

void SpeechRecognition::SetLang(const nsAString& aArg) { mLang = aArg; }

bool SpeechRecognition::GetContinuous(ErrorResult& aRv) const {

  return mContinuous;

void SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) {

  mContinuous = aArg;

bool SpeechRecognition::InterimResults() const { return mInterimResults; }

void SpeechRecognition::SetInterimResults(bool aArg) { mInterimResults = aArg; }

uint32_t SpeechRecognition::MaxAlternatives() const { return mMaxAlternatives; }

void SpeechRecognition::SetMaxAlternatives(uint32_t aArg) {

  mMaxAlternatives = aArg;

void SpeechRecognition::GetServiceURI(nsString& aRetVal,

                                      ErrorResult& aRv) const {

  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);

void SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv) {

  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);

void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,

                              CallerType aCallerType, ErrorResult& aRv) {

  if (mCurrentState != STATE_IDLE) {

    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);

    return;

  if (!SetRecognitionService(aRv)) {

    return;

  if (!ValidateAndSetGrammarList(aRv)) {

    return;

  mEncodeTaskQueue =

      TaskQueue::Create(GetMediaThreadPool(MediaThreadType::WEBRTC_WORKER),

                        "WebSpeechEncoderThread");

  nsresult rv;

  rv = mRecognitionService->Initialize(this);

  if (NS_WARN_IF(NS_FAILED(rv))) {

    return;

  MediaStreamConstraints constraints;

  constraints.mAudio.SetAsBoolean() = true;

  MOZ_ASSERT(!mListener);

  mListener = new TrackListener(this);

  if (aStream.WasPassed()) {

    mStream = &aStream.Value();

    mTrackIsOwned = false;

    mStream->RegisterTrackListener(mListener);

    nsTArray<RefPtr<AudioStreamTrack>> tracks;

    mStream->GetAudioTracks(tracks);

    for (const RefPtr<AudioStreamTrack>& track : tracks) {

      if (!track->Ended()) {

        NotifyTrackAdded(track);

        break;

  } else {

    mTrackIsOwned = true;

    nsPIDOMWindowInner* win = GetOwnerWindow();

    if (!win || !win->IsFullyActive()) {

      aRv.ThrowInvalidStateError("The document is not fully active.");

      return;

    AutoNoJSAPI nojsapi;

    RefPtr<SpeechRecognition> self(this);

    MediaManager::Get()

        ->GetUserMedia(win, constraints, aCallerType)

        ->Then(

            GetCurrentSerialEventTarget(), __func__,

            [this, self,

             generation = mStreamGeneration](RefPtr<DOMMediaStream>&& aStream) {

              nsTArray<RefPtr<AudioStreamTrack>> tracks;

              aStream->GetAudioTracks(tracks);

              if (mAborted || mCurrentState != STATE_STARTING ||

                  mStreamGeneration != generation) {

                // We were probably aborted. Exit early.

                for (const RefPtr<AudioStreamTrack>& track : tracks) {

                  track->Stop();

                return;

              mStream = std::move(aStream);

              mStream->RegisterTrackListener(mListener);

              for (const RefPtr<AudioStreamTrack>& track : tracks) {

                if (!track->Ended()) {

                  NotifyTrackAdded(track);

},

            [this, self,

             generation = mStreamGeneration](RefPtr<MediaMgrError>&& error) {

              if (mAborted || mCurrentState != STATE_STARTING ||

                  mStreamGeneration != generation) {

                // We were probably aborted. Exit early.

                return;

              SpeechRecognitionErrorCode errorCode;

              if (error->mName == MediaMgrError::Name::NotAllowedError) {

                errorCode = SpeechRecognitionErrorCode::Not_allowed;

              } else {

                errorCode = SpeechRecognitionErrorCode::Audio_capture;

              DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,

                            error->mMessage);

});

  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);

  NS_DispatchToMainThread(event);

bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) {

  if (!GetOwnerWindow()) {

    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);

    return false;

  // See:

  // https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang

  nsAutoString lang;

  if (!mLang.IsEmpty()) {

    lang = mLang;

  } else {

    nsCOMPtr<Document> document = GetOwnerWindow()->GetExtantDoc();

    if (!document) {

      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);

      return false;

    nsCOMPtr<Element> element = document->GetRootElement();

    if (!element) {

      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);

      return false;

    nsAutoString lang;

    element->GetLang(lang);

  auto result = CreateSpeechRecognitionService(GetOwnerWindow(), this, lang);

  if (result.isErr()) {

    switch (result.unwrapErr()) {

      case ServiceCreationError::ServiceNotFound:

        aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);

        break;

      default:

        MOZ_CRASH("Unknown error");

    return false;

  mRecognitionService = result.unwrap();

  MOZ_DIAGNOSTIC_ASSERT(mRecognitionService);

  return true;

bool SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) {

  if (!mSpeechGrammarList) {

    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);

    return false;

  uint32_t grammarListLength = mSpeechGrammarList->Length();

  for (uint32_t count = 0; count < grammarListLength; ++count) {

    RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv);

    if (aRv.Failed()) {

      return false;

    if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList(

            speechGrammar.get(), nullptr))) {

      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);

      return false;

  return true;

void SpeechRecognition::Stop() {

  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);

  NS_DispatchToMainThread(event);

void SpeechRecognition::Abort() {

  if (mAborted) {

    return;

  mAborted = true;

  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);

  NS_DispatchToMainThread(event);

void SpeechRecognition::NotifyTrackAdded(

    const RefPtr<MediaStreamTrack>& aTrack) {

  if (mTrack) {

    return;

  RefPtr<AudioStreamTrack> audioTrack = aTrack->AsAudioStreamTrack();

  if (!audioTrack) {

    return;

  if (audioTrack->Ended()) {

    return;

  StartRecording(audioTrack);

void SpeechRecognition::DispatchError(EventType aErrorType,

                                      SpeechRecognitionErrorCode aErrorCode,

                                      const nsACString& aMessage) {

  MOZ_ASSERT(NS_IsMainThread());

  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||

                 aErrorType == EVENT_AUDIO_ERROR,

             "Invalid error type!");

  RefPtr<SpeechRecognitionError> srError =

      new SpeechRecognitionError(nullptr, nullptr, nullptr);

  srError->InitSpeechRecognitionError(u"error"_ns, true, false, aErrorCode,

                                      aMessage);

  RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);

  event->mError = srError;

  NS_DispatchToMainThread(event);

/*

 * Buffer audio samples into mAudioSamplesBuffer until aBufferSize.

 * Updates mBufferedSamples and returns the number of samples that were

 * buffered.

*/

uint32_t SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,

                                              uint32_t aSampleCount) {

  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);

  MOZ_ASSERT(mAudioSamplesBuffer);

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());

  size_t samplesToCopy =

      std::min(aSampleCount, mAudioSamplesPerChunk - mBufferedSamples);

  PodCopy(samplesBuffer + mBufferedSamples, aSamples, samplesToCopy);

  mBufferedSamples += samplesToCopy;

  return samplesToCopy;

/*

 * Split a samples buffer starting of a given size into

 * chunks of equal size. The chunks are stored in the array

 * received as argument.

 * Returns the offset of the end of the last chunk that was

 * created.

*/

uint32_t SpeechRecognition::SplitSamplesBuffer(

    const int16_t* aSamplesBuffer, uint32_t aSampleCount,

    nsTArray<RefPtr<SharedBuffer>>& aResult) {

  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {

    CheckedInt<size_t> bufferSize(sizeof(int16_t));

    bufferSize *= mAudioSamplesPerChunk;

    RefPtr<SharedBuffer> chunk = SharedBuffer::Create(bufferSize);

    PodCopy(static_cast<short*>(chunk->Data()), aSamplesBuffer + chunkStart,

            mAudioSamplesPerChunk);

    aResult.AppendElement(chunk.forget());

    chunkStart += mAudioSamplesPerChunk;

  return chunkStart;

AudioSegment* SpeechRecognition::CreateAudioSegment(

    nsTArray<RefPtr<SharedBuffer>>& aChunks) {

  AudioSegment* segment = new AudioSegment();

  for (uint32_t i = 0; i < aChunks.Length(); ++i) {

    RefPtr<SharedBuffer> buffer = aChunks[i];

    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    AutoTArray<const int16_t*, 1> channels;

    channels.AppendElement(chunkData);

    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk,

                          PRINCIPAL_HANDLE_NONE);

  return segment;

void SpeechRecognition::FeedAudioData(

    nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,

    already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration,

    MediaTrackListener* aProvider, TrackRate aTrackRate) {

  NS_ASSERTION(!NS_IsMainThread(),

               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a

  // multiple of its frame size.

  // Since we can't assume we will receive the frames in appropriate-sized

  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk

  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // ensure aSamples is deleted

  RefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;

  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());

  AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible

  if (mBufferedSamples > 0) {

    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {

      chunksToSend.AppendElement(mAudioSamplesBuffer.forget());

      mBufferedSamples = 0;

  // create sample chunks of correct size

  if (samplesIndex < aDuration) {

    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,

                                       aDuration - samplesIndex, chunksToSend);

  // buffer remaining samples

  if (samplesIndex < aDuration) {

    mBufferedSamples = 0;

    CheckedInt<size_t> bufferSize(sizeof(int16_t));

    bufferSize *= mAudioSamplesPerChunk;

    mAudioSamplesBuffer = SharedBuffer::Create(bufferSize);

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);

  AudioSegment* segment = CreateAudioSegment(chunksToSend);

  RefPtr<SpeechEvent> event = new SpeechEvent(aRecognition, EVENT_AUDIO_DATA);

  event->mAudioSegment = segment;

  event->mProvider = aProvider;

  event->mTrackRate = aTrackRate;

  NS_DispatchToMainThread(event);

const char* SpeechRecognition::GetName(FSMState aId) {

  static const char* names[] = {

      "STATE_IDLE",        "STATE_STARTING",

      "STATE_ESTIMATING",  "STATE_WAITING_FOR_SPEECH",

      "STATE_RECOGNIZING", "STATE_WAITING_FOR_RESULT",

      "STATE_ABORTING",

};

  MOZ_ASSERT(aId < STATE_COUNT);

  MOZ_ASSERT(std::size(names) == STATE_COUNT);

  return names[aId];

const char* SpeechRecognition::GetName(SpeechEvent* aEvent) {

  static const char* names[] = {"EVENT_START",

                                "EVENT_STOP",

                                "EVENT_ABORT",

                                "EVENT_AUDIO_DATA",

                                "EVENT_AUDIO_ERROR",

                                "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",

                                "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",

                                "EVENT_RECOGNITIONSERVICE_ERROR"};

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);

  MOZ_ASSERT(std::size(names) == EVENT_COUNT);

  return names[aEvent->mType];

TaskQueue* SpeechRecognition::GetTaskQueueForEncoding() const {

  MOZ_ASSERT(NS_IsMainThread());

  return mEncodeTaskQueue;

SpeechEvent::SpeechEvent(SpeechRecognition* aRecognition,

                         SpeechRecognition::EventType aType)

    : Runnable("dom::SpeechEvent"),

      mAudioSegment(nullptr),

      mRecognitionResultList(nullptr),

      mError(nullptr),

      mRecognition(new nsMainThreadPtrHolder<SpeechRecognition>(

          "SpeechEvent::SpeechEvent", aRecognition)),

      mType(aType),

      mTrackRate(0) {}

SpeechEvent::SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,

                         SpeechRecognition::EventType aType)

    : Runnable("dom::SpeechEvent"),

      mAudioSegment(nullptr),

      mRecognitionResultList(nullptr),

      mError(nullptr),

      mRecognition(aRecognition),

      mType(aType),

      mTrackRate(0) {}

SpeechEvent::~SpeechEvent() { delete mAudioSegment; }

NS_IMETHODIMP

SpeechEvent::Run() {

  mRecognition->ProcessEvent(this);

  return NS_OK;

}  // namespace mozilla::dom