OSXSpeechSynthesizerService.mm

firefox-main/dom/media/webspeech/synth/cocoa/OSXSpeechSynthesizerService.mm (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: Web Speech

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: Objective-C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset:

 * 2 -*- */

/* vim: set ts=2 sw=2 et tw=80: */

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "OSXSpeechSynthesizerService.h"

#include "mozilla/Assertions.h"

#include "mozilla/ClearOnShutdown.h"

#include "mozilla/Preferences.h"

#include "mozilla/StaticPrefs_media.h"

#include "mozilla/dom/nsSpeechTask.h"

#include "mozilla/dom/nsSynthVoiceRegistry.h"

#include "nsCocoaUtils.h"

#include "nsISupports.h"

#include "nsIThread.h"

#include "nsObjCExceptions.h"

#include "nsServiceManagerUtils.h"

#include "nsThreadUtils.h"

#include "nsXULAppAPI.h"

#import <Cocoa/Cocoa.h>

@class SpeechDelegate;

// We can escape the default delimiters ("[[" and "]]") by temporarily

// changing the delimiters just before they appear, and changing them back

// just after.

#define DLIM_ESCAPE_START "[[dlim (( ))]]"

#define DLIM_ESCAPE_END "((dlim [[ ]]))"

using namespace mozilla;

// Callback object for a single utterance. It keeps the NSSpeechSynthesizer
// that speaks the utterance, receives the notifications forwarded by
// SpeechDelegate, and reports them to the nsISpeechTask it was created with.
class SpeechTaskCallback final : public nsISpeechTaskCallback {
 public:
  // aTask    - task that receives the Dispatch* notifications.
  // aSynth   - synthesizer used for this utterance.
  // aOffsets - offset table consulted by OnWillSpeakWord(); it is cloned
  //            into mOffsets.
  SpeechTaskCallback(nsISpeechTask* aTask, NSSpeechSynthesizer* aSynth,
                     const nsTArray<size_t>& aOffsets);

  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
  NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(SpeechTaskCallback,
                                           nsISpeechTaskCallback)

  NS_DECL_NSISPEECHTASKCALLBACK

  // Invoked by SpeechDelegate when the synthesizer is about to speak a word.
  // aIndex and aLength are the location and length of the reported range.
  void OnWillSpeakWord(uint32_t aIndex, uint32_t aLength);

  // Invoked by SpeechDelegate when the synthesizer reports an error at
  // character index aIndex.
  void OnError(uint32_t aIndex);

  // Invoked by SpeechDelegate when the synthesizer has finished speaking.
  void OnDidFinishSpeaking();

 private:
  virtual ~SpeechTaskCallback();

  // Time elapsed since mStartingTime, in seconds.
  float GetTimeDurationFromStart();

  // Task to notify; set to nullptr once speaking has finished.
  nsCOMPtr<nsISpeechTask> mTask;
  // Synthesizer for this utterance; released in the destructor.
  NSSpeechSynthesizer* mSpeechSynthesizer;
  // Delegate created in the constructor and released in the destructor.
  SpeechDelegate* mDelegate;
  // Timestamp taken when this object was constructed.
  TimeStamp mStartingTime;
  // Most recent offset looked up from mOffsets by OnWillSpeakWord().
  uint32_t mCurrentIndex;
  // Offset table indexed by the position reported by the synthesizer.
  nsTArray<size_t> mOffsets;
};

// NSSpeechSynthesizerDelegate implementation that forwards the synthesizer's
// notifications to a C++ SpeechTaskCallback.
@interface SpeechDelegate : NSObject <NSSpeechSynthesizerDelegate> {
 @private
  // Plain pointer assigned in initWithCallback:. This class never addrefs or
  // releases it, so the callback has to stay alive for as long as this
  // delegate can receive notifications.
  SpeechTaskCallback* mCallback;
}

// Creates a delegate that forwards every notification to aCallback.
- (id)initWithCallback:(SpeechTaskCallback*)aCallback;
@end

@implementation SpeechDelegate

// Remembers aCallback as the target for all delegate notifications.
// Returns nil when the superclass initializer fails.
- (id)initWithCallback:(SpeechTaskCallback*)aCallback {
  // -init is allowed to return nil or a different object, so its result has
  // to be assigned back to self before any instance variable is written.
  self = [super init];
  if (self) {
    mCallback = aCallback;
  }
  return self;
}

// Forwards the location and length of the word about to be spoken.
- (void)speechSynthesizer:(NSSpeechSynthesizer*)aSender
            willSpeakWord:(NSRange)aRange
                 ofString:(NSString*)aString {
  mCallback->OnWillSpeakWord(aRange.location, aRange.length);
}

// Forwards the end-of-speech notification; aFinishedSpeaking is not used.
- (void)speechSynthesizer:(NSSpeechSynthesizer*)aSender
        didFinishSpeaking:(BOOL)aFinishedSpeaking {
  mCallback->OnDidFinishSpeaking();
}

// Forwards the character index at which the synthesizer reported an error;
// the string and message arguments are not used.
- (void)speechSynthesizer:(NSSpeechSynthesizer*)aSender
    didEncounterErrorAtIndex:(NSUInteger)aCharacterIndex
                    ofString:(NSString*)aString
                     message:(NSString*)aMessage {
  mCallback->OnError(aCharacterIndex);
}

@end

// Cycle-collection and reference-counting boilerplate for SpeechTaskCallback.
// mTask is the only member listed in the cycle-collection macro.
NS_IMPL_CYCLE_COLLECTION(SpeechTaskCallback, mTask);

// Interface map: nsISpeechTaskCallback, with nsISupports resolved through it.
NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechTaskCallback)
  NS_INTERFACE_MAP_ENTRY(nsISpeechTaskCallback)
  NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTaskCallback)
NS_INTERFACE_MAP_END

NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechTaskCallback)
NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechTaskCallback)

// Stores the task and synthesizer, clones the offset table, installs a new
// SpeechDelegate on the synthesizer and records the construction time.
// aSynth is not retained here, yet the destructor releases it, so the
// caller's reference is effectively handed over to this object.
SpeechTaskCallback::SpeechTaskCallback(nsISpeechTask* aTask,
                                       NSSpeechSynthesizer* aSynth,
                                       const nsTArray<size_t>& aOffsets)
    : mTask(aTask),
      mSpeechSynthesizer(aSynth),
      mCurrentIndex(0),
      mOffsets(aOffsets.Clone()) {
  // The delegate forwards synthesizer notifications back to this object.
  mDelegate = [[SpeechDelegate alloc] initWithCallback:this];
  mSpeechSynthesizer.delegate = mDelegate;

  // Reference point for GetTimeDurationFromStart().
  mStartingTime = TimeStamp::Now();
}

// Detaches the delegate from the synthesizer, then drops the references this
// object holds on the delegate and on the synthesizer.
SpeechTaskCallback::~SpeechTaskCallback() {
  // Detach first so the synthesizer no longer points at the delegate that is
  // about to be released.
  mSpeechSynthesizer.delegate = nil;
  [mDelegate release];
  [mSpeechSynthesizer release];
}

// nsISpeechTaskCallback: stops the synthesizer.
// Returns NS_OK, or NS_ERROR_FAILURE through the Objective-C try-block macro.
NS_IMETHODIMP
SpeechTaskCallback::OnCancel() {
  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;

  [mSpeechSynthesizer stopSpeaking];

  return NS_OK;

  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
}

// nsISpeechTaskCallback: pauses the synthesizer immediately and, while a task
// is still attached, reports the pause to it.
// Returns NS_OK when the task was notified, NS_ERROR_FAILURE otherwise.
NS_IMETHODIMP
SpeechTaskCallback::OnPause() {
  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;

  [mSpeechSynthesizer pauseSpeakingAtBoundary:NSSpeechImmediateBoundary];

  if (mTask) {
    mTask->DispatchPause(GetTimeDurationFromStart(), mCurrentIndex);
    return NS_OK;
  }

  // When calling pause() on the child process, it may not have received the
  // end event from the chrome process yet.
  return NS_ERROR_FAILURE;

  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
}

// nsISpeechTaskCallback: resumes the synthesizer and, while a task is still
// attached, reports the resume to it.
// Returns NS_OK when the task was notified, NS_ERROR_FAILURE otherwise.
NS_IMETHODIMP
SpeechTaskCallback::OnResume() {
  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;

  [mSpeechSynthesizer continueSpeaking];

  if (mTask) {
    mTask->DispatchResume(GetTimeDurationFromStart(), mCurrentIndex);
    return NS_OK;
  }

  // When calling resume() on the child process, it may not have received the
  // end event from the chrome process yet.
  return NS_ERROR_FAILURE;

  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
}

// nsISpeechTaskCallback: applies aVolume to the synthesizer's volume
// property. Any error reported by the property setter is not inspected.
NS_IMETHODIMP
SpeechTaskCallback::OnVolumeChanged(float aVolume) {
  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;

  NSNumber* volume = @(aVolume);
  [mSpeechSynthesizer setObject:volume
                    forProperty:NSSpeechVolumeProperty
                          error:nil];

  return NS_OK;

  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
}

// Returns the time elapsed since mStartingTime, in seconds.
float SpeechTaskCallback::GetTimeDurationFromStart() {
  return (TimeStamp::Now() - mStartingTime).ToSeconds();
}

// Updates mCurrentIndex from the offset table and reports a "word" boundary
// to the task.
// aIndex  - position reported by the synthesizer, used to index mOffsets.
// aLength - length of the word as reported by the synthesizer.
void SpeechTaskCallback::OnWillSpeakWord(uint32_t aIndex, uint32_t aLength) {
  // An index outside the table leaves the previous position in place.
  if (aIndex < mOffsets.Length()) {
    mCurrentIndex = mOffsets[aIndex];
  }

  if (mTask) {
    mTask->DispatchBoundary(u"word"_ns, GetTimeDurationFromStart(),
                            mCurrentIndex, aLength, 1);
  }
}

// Reports a synthesizer error at character index aIndex to the task, if a
// task is still attached.
void SpeechTaskCallback::OnError(uint32_t aIndex) {
  if (mTask) {
    mTask->DispatchError(GetTimeDurationFromStart(), aIndex);
  }
}

// Reports the end of the utterance to the task, then detaches the delegate
// and drops the task reference so no further notifications are forwarded.
void SpeechTaskCallback::OnDidFinishSpeaking() {
  // mTask is a nullable member (it is cleared below and the other
  // notification handlers check it), so guard the dereference here as well.
  if (mTask) {
    mTask->DispatchEnd(GetTimeDurationFromStart(), mCurrentIndex);
  }

  // no longer needed
  [mSpeechSynthesizer setDelegate:nil];
  mTask = nullptr;
}

namespace mozilla {

namespace dom {

// Description of one voice collected by EnumVoicesRunnable and registered by
// RegisterVoicesRunnable.
struct OSXVoice {
  OSXVoice() = default;

  // URI passed to the voice registry ("urn:moz-tts:osx:" + identifier).
  nsString mUri;
  // Voice name.
  nsString mName;
  // Locale identifier with '_' replaced by '-'.
  nsString mLocale;
  // True for the voice that matches the synthesizer's default voice.
  bool mIsDefault = false;
};

// Runnable that adds a list of voices to the synth voice registry on behalf
// of an OSXSpeechSynthesizerService.
class RegisterVoicesRunnable final : public Runnable {
 public:
  // aSpeechService - service the voices are registered for.
  // aList          - voices to register; stored by reference, not copied.
  RegisterVoicesRunnable(OSXSpeechSynthesizerService* aSpeechService,
                         nsTArray<OSXVoice>& aList)
      : Runnable("RegisterVoicesRunnable"),
        mSpeechService(aSpeechService),
        mVoices(aList) {}

  NS_IMETHOD Run() override;

 private:
  ~RegisterVoicesRunnable() override = default;

  // This runnable is always run synchronously, so it is unnecessary to hold
  // a strong reference to the service or to copy the voice list.
  OSXSpeechSynthesizerService* mSpeechService;
  nsTArray<OSXVoice>& mVoices;
};

// Adds every voice in mVoices to the synth voice registry, marks the default
// voice, and then notifies the registry that the voice list changed.
// Returns the service-lookup error when the registry is unavailable,
// NS_OK otherwise.
NS_IMETHODIMP
RegisterVoicesRunnable::Run() {
  nsresult rv;
  nsCOMPtr<nsISynthVoiceRegistry> registry =
      do_GetService(NS_SYNTHVOICEREGISTRY_CONTRACTID, &rv);
  if (!registry) {
    return rv;
  }

  // Iterate by const reference: each OSXVoice carries three strings and
  // nothing in the loop modifies it, so a per-iteration copy is wasted work.
  for (const OSXVoice& voice : mVoices) {
    rv = registry->AddVoice(mSpeechService, voice.mUri, voice.mName,
                            voice.mLocale, true, false);
    if (NS_WARN_IF(NS_FAILED(rv))) {
      // A voice that fails to register is skipped; the rest are still added.
      continue;
    }

    if (voice.mIsDefault) {
      registry->SetDefaultVoice(voice.mUri, true);
    }
  }

  registry->NotifyVoicesChanged();

  return NS_OK;
}

// Runnable that enumerates the voices offered by NSSpeechSynthesizer and
// hands them to a RegisterVoicesRunnable for registration.
class EnumVoicesRunnable final : public Runnable {
 public:
  // aSpeechService - service the enumerated voices will be registered for.
  explicit EnumVoicesRunnable(OSXSpeechSynthesizerService* aSpeechService)
      : Runnable("EnumVoicesRunnable"), mSpeechService(aSpeechService) {}

  NS_IMETHOD Run() override;

 private:
  ~EnumVoicesRunnable() override = default;

  // Strong reference to the service for the lifetime of this runnable.
  RefPtr<OSXSpeechSynthesizerService> mSpeechService;
};

// Collects name, locale and identifier for every available voice, then
// dispatches a RegisterVoicesRunnable to the main thread and spins the event
// loop until it has completed.
// Returns NS_OK, or NS_ERROR_FAILURE through the Objective-C try-block macro.
NS_IMETHODIMP
EnumVoicesRunnable::Run() {
  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;

  AutoTArray<OSXVoice, 64> list;

  NSArray* availableVoices = [NSSpeechSynthesizer availableVoices];
  NSString* systemDefault = [NSSpeechSynthesizer defaultVoice];

  for (NSString* voiceId in availableVoices) {
    OSXVoice entry;

    NSDictionary* attributes =
        [NSSpeechSynthesizer attributesForVoice:voiceId];

    nsAutoString identifier;
    nsCocoaUtils::GetStringForNSString(attributes[NSVoiceIdentifier],
                                       identifier);

    nsCocoaUtils::GetStringForNSString(attributes[NSVoiceName], entry.mName);

    nsCocoaUtils::GetStringForNSString(attributes[NSVoiceLocaleIdentifier],
                                       entry.mLocale);
    // The locale is stored with '-' as the separator instead of '_'.
    entry.mLocale.ReplaceChar('_', '-');

    entry.mUri.AssignLiteral("urn:moz-tts:osx:");
    entry.mUri.Append(identifier);

    if ([voiceId isEqualToString:systemDefault]) {
      entry.mIsDefault = true;
    }

    list.AppendElement(entry);
  }

  // The registration runnable only references `list`, so this call has to
  // block until the runnable has finished.
  RefPtr<RegisterVoicesRunnable> registerTask =
      new RegisterVoicesRunnable(mSpeechService, list);
  NS_DispatchAndSpinEventLoopUntilComplete("EnumVoicesRunnable"_ns,
                                           GetMainThreadSerialEventTarget(),
                                           registerTask.forget());

  return NS_OK;

  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
}

// Storage for the singleton handed out by GetInstance().
StaticRefPtr<OSXSpeechSynthesizerService>
    OSXSpeechSynthesizerService::sSingleton;

// Interface map: nsISpeechService and nsIObserver, with nsISupports resolved
// through nsISpeechService.
NS_INTERFACE_MAP_BEGIN(OSXSpeechSynthesizerService)
  NS_INTERFACE_MAP_ENTRY(nsISpeechService)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
  NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechService)
NS_INTERFACE_MAP_END

NS_IMPL_ADDREF(OSXSpeechSynthesizerService)
NS_IMPL_RELEASE(OSXSpeechSynthesizerService)

// Constructs the service in the uninitialized state; Init() sets
// mInitialized to true when it succeeds.
OSXSpeechSynthesizerService::OSXSpeechSynthesizerService()
    : mInitialized(false) {}

// Starts the service: creates the "SpeechWorker" thread and queues the voice
// enumeration on it.
// Returns false when the backend is disabled by preference, when the worker
// thread cannot be created, or when the enumeration cannot be dispatched;
// returns true (and sets mInitialized) otherwise.
bool OSXSpeechSynthesizerService::Init() {
  if (Preferences::GetBool("media.webspeech.synth.test") ||
      !StaticPrefs::media_webspeech_synth_enabled()) {
    // When test is enabled, we shouldn't add OS backend (Bug 1160844)
    return false;
  }

  nsCOMPtr<nsIThread> thread;
  if (NS_FAILED(NS_NewNamedThread("SpeechWorker", getter_AddRefs(thread)))) {
    return false;
  }

  // Get all the voices and register in the SynthVoiceRegistry
  nsCOMPtr<nsIRunnable> runnable = new EnumVoicesRunnable(this);
  if (NS_WARN_IF(NS_FAILED(thread->Dispatch(runnable, NS_DISPATCH_NORMAL)))) {
    // Without the enumeration no voice is ever registered, so do not report
    // the service as initialized.
    return false;
  }

  mInitialized = true;
  return true;
}

// nsISpeechService: speaks aText with the voice named by aUri.
// aText   - text to speak; "[[" and "]]" sequences are escaped before the
//           text is handed to the synthesizer.
// aUri    - voice URI; expected to start with "urn:moz-tts:osx:".
// aVolume - written to the synthesizer's volume property.
// aRate   - multiplied by 200 and written to the rate property.
// aPitch  - scales the synthesizer's default base pitch by (aPitch / 2 + 0.5).
// aTask   - task that is set up with a SpeechTaskCallback and then started.
// Returns NS_OK on success, the error from aTask->Setup(), or
// NS_ERROR_FAILURE when the synthesizer refuses to start (or through the
// Objective-C try-block macro).
NS_IMETHODIMP
OSXSpeechSynthesizerService::Speak(const nsAString& aText,
                                   const nsAString& aUri, float aVolume,
                                   float aRate, float aPitch,
                                   nsISpeechTask* aTask) {
  NS_OBJC_BEGIN_TRY_BLOCK_RETURN;

  MOZ_ASSERT(StringBeginsWith(aUri, u"urn:moz-tts:osx:"_ns),
             "OSXSpeechSynthesizerService doesn't allow this voice URI");

  // This reference is handed to the SpeechTaskCallback created below, whose
  // destructor releases it.
  NSSpeechSynthesizer* synth = [[NSSpeechSynthesizer alloc] init];

  // strlen("urn:moz-tts:osx:") == 16
  NSString* identifier = nsCocoaUtils::ToNSString(Substring(aUri, 16));
  [synth setVoice:identifier];

  // default rate is 180-220
  [synth setObject:[NSNumber numberWithInt:aRate * 200]
       forProperty:NSSpeechRateProperty
             error:nil];

  // volume allows 0.0-1.0
  [synth setObject:[NSNumber numberWithFloat:aVolume]
       forProperty:NSSpeechVolumeProperty
             error:nil];

  // Use default pitch value to calculate this
  NSNumber* defaultPitch = [synth objectForProperty:NSSpeechPitchBaseProperty
                                              error:nil];
  if (defaultPitch) {
    int newPitch = [defaultPitch intValue] * (aPitch / 2 + 0.5);
    [synth setObject:[NSNumber numberWithInt:newPitch]
         forProperty:NSSpeechPitchBaseProperty
               error:nil];
  }

  nsAutoString escapedText;
  // We need to map the offsets from the given text to the escaped text.
  // The index of the offsets array is the position in the escaped text,
  // the element value is the position in the user-supplied text.
  nsTArray<size_t> offsets;
  offsets.SetCapacity(aText.Length());

  const size_t escapeStartLength = strlen(DLIM_ESCAPE_START);
  const size_t escapeEndLength = strlen(DLIM_ESCAPE_END);

  // This loop looks for occurrences of "[[" or "]]", escapes them, and
  // populates the offsets array to supply a map to the original offsets.
  for (size_t i = 0; i < aText.Length(); i++) {
    if (aText.Length() > i + 1 && ((aText[i] == ']' && aText[i + 1] == ']') ||
                                   (aText[i] == '[' && aText[i + 1] == '['))) {
      // Every character of the inserted delimiters gets an explicit mapping
      // to the adjacent original character, so a lookup that lands inside a
      // delimiter never reads a slot that was appended but never assigned.
      escapedText.AppendLiteral(DLIM_ESCAPE_START);
      for (size_t n = 0; n < escapeStartLength; n++) {
        offsets.AppendElement(i);
      }

      escapedText.Append(aText[i]);
      offsets.AppendElement(i);
      escapedText.Append(aText[++i]);
      offsets.AppendElement(i);

      escapedText.AppendLiteral(DLIM_ESCAPE_END);
      for (size_t n = 0; n < escapeEndLength; n++) {
        offsets.AppendElement(i);
      }
    } else {
      escapedText.Append(aText[i]);
      offsets.AppendElement(i);
    }
  }

  RefPtr<SpeechTaskCallback> callback =
      new SpeechTaskCallback(aTask, synth, offsets);
  nsresult rv = aTask->Setup(callback);
  NS_ENSURE_SUCCESS(rv, rv);

  NSString* text = nsCocoaUtils::ToNSString(escapedText);
  BOOL success = [synth startSpeakingString:text];
  NS_ENSURE_TRUE(success, NS_ERROR_FAILURE);

  aTask->DispatchStart();
  return NS_OK;

  NS_OBJC_END_TRY_BLOCK_RETURN(NS_ERROR_FAILURE);
}

// nsIObserver: no notification is acted upon; always returns NS_OK.
NS_IMETHODIMP
OSXSpeechSynthesizerService::Observe(nsISupports* aSubject, const char* aTopic,
                                     const char16_t* aData) {
  return NS_OK;
}

// Returns the singleton service, creating and initializing it on first use.
// Returns nullptr outside the default process type and when Init() fails.
// Asserts that it is called on the main thread.
OSXSpeechSynthesizerService* OSXSpeechSynthesizerService::GetInstance() {
  MOZ_ASSERT(NS_IsMainThread());

  if (XRE_GetProcessType() != GeckoProcessType_Default) {
    return nullptr;
  }

  if (sSingleton) {
    return sSingleton;
  }

  RefPtr<OSXSpeechSynthesizerService> candidate =
      new OSXSpeechSynthesizerService();
  if (!candidate->Init()) {
    // Nothing is cached on failure, so a later call tries again.
    return nullptr;
  }

  sSingleton = candidate;
  ClearOnShutdown(&sSingleton);
  return sSingleton;
}

// Returns an owning reference to the singleton obtained from GetInstance()
// (null when GetInstance() returns nullptr).
already_AddRefed<OSXSpeechSynthesizerService>
OSXSpeechSynthesizerService::GetInstanceForService() {
  RefPtr<OSXSpeechSynthesizerService> instance(GetInstance());
  return instance.forget();
}

}  // namespace dom

}  // namespace mozilla