// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/
// TODO: FXIOS-14934 - remove preconcurrency
import AVFoundation
import Speech
import Common
import CoreMedia
/// A transcription engine built on iOS 26's `SpeechAnalyzer` + `SpeechTranscriber`.
///
/// Responsibilities:
/// - Check microphone + speech permissions (via the injected `AuthorizeProvider`)
/// - Resolve a supported locale and ensure its speech model is installed
/// - Stream transcription results through an `AsyncThrowingStream` continuation
///
/// This type is an `@MainActor` class to keep audio/transcription state safe across concurrent calls.
@available(iOS 26.0, *)
@MainActor
final class SpeechAnalyzerEngine: TranscriptionEngine {
    private let audioManager: AudioManagerProtocol
    private let authorizer: AuthorizeProvider
    private let locale: Locale
    private var analyzer: SpeechAnalyzer?
    private var transcriber: SpeechTranscriber?
    /// Feeds captured audio buffers into the analyzer's input stream; finished in `stop()`.
    private var inputContinuation: AsyncStream<AnalyzerInput>.Continuation?
    /// Consumes `transcriber.results` and forwards them to the client's continuation.
    private var resultsTask: Task<Void, Error>?

    init(
        locale: Locale = Locale.current,
        audioManager: AudioManagerProtocol,
        authorizer: AuthorizeProvider
    ) {
        self.audioManager = audioManager
        self.authorizer = authorizer
        self.locale = locale
    }

    /// Verifies permissions and configures the audio session before transcription starts.
    ///
    /// - Throws: `SpeechError.permissionDenied` when either microphone or speech
    ///   permission is missing, or any error thrown while configuring the audio session.
    func prepare() async throws {
        guard await isPermissionGranted() else {
            throw SpeechError.permissionDenied
        }
        try audioManager.configureAudioSession()
    }

    /// Starts transcription and streams results through `continuation`.
    ///
    /// This method:
    /// 1) resolves a supported locale
    /// 2) creates a transcriber + analyzer
    /// 3) ensures the speech model is installed (downloads if needed)
    /// 4) prepares the analyzer with a compatible audio format
    /// 5) starts analyzer + results tasks
    /// 6) starts microphone capture and feeds audio buffers into the analyzer input stream
    ///
    /// - Parameter continuation: Receives incremental and final `SpeechResult` values.
    func start(continuation: AsyncThrowingStream<SpeechResult, any Error>.Continuation) async throws {
        // TODO: Use LocaleProvider instead
        let resolvedLocale = try await resolveLocale(with: locale)
        let transcriber = SpeechTranscriber(
            locale: resolvedLocale,
            transcriptionOptions: [],
            reportingOptions: [.volatileResults],
            attributeOptions: [.transcriptionConfidence]
        )
        self.transcriber = transcriber
        try await ensureModelAvailable(transcriber: transcriber, locale: resolvedLocale)
        let analyzer = SpeechAnalyzer(modules: [transcriber])
        self.analyzer = analyzer
        guard let targetFormat = await SpeechAnalyzer.bestAvailableAudioFormat(compatibleWith: [transcriber]) else {
            throw SpeechError.noAudioFormat
        }
        try await analyzer.prepareToAnalyze(in: targetFormat)
        // Build the analyzer input stream; mic capture yields audio buffers into `inputContinuation`.
        // `makeStream()` hands back stream + continuation directly, avoiding the
        // store-inside-builder-then-unwrap dance (whose failure path was unreachable anyway,
        // since the AsyncStream build closure runs synchronously).
        let (inputStream, inputContinuation) = AsyncStream<AnalyzerInput>.makeStream()
        self.inputContinuation = inputContinuation
        try await analyzer.start(inputSequence: inputStream)
        // Capture the local `transcriber` rather than `[weak self]`: it does not retain
        // the engine, and it guarantees the client continuation is always finished —
        // the previous weak-self guard could return early and leave the client's
        // stream hanging forever if the engine was deallocated first.
        resultsTask = Task {
            do {
                for try await result in transcriber.results {
                    let speechResult = SpeechResult(
                        text: String(result.text.characters),
                        isFinal: result.isFinal
                    )
                    continuation.yield(speechResult)
                    if result.isFinal {
                        continuation.finish()
                        // Stop consuming once the client stream is complete; the old code
                        // kept iterating and yielding into a finished continuation.
                        break
                    }
                }
            } catch {
                continuation.finish(throwing: error)
            }
        }
        // Start microphone capture and feed `AnalyzerInput(buffer:)` into the stream.
        try audioManager.startCapture(targetFormat: targetFormat, bufferSize: 4096) { buffer in
            inputContinuation.yield(AnalyzerInput(buffer: buffer))
        }
        try audioManager.prepareAndStartEngine()
    }

    /// Stops audio capture, drains the analyzer, and tears down all transcription state.
    func stop() async throws {
        audioManager.stopEngine()
        inputContinuation?.finish()
        inputContinuation = nil
        try await analyzer?.finalizeAndFinishThroughEndOfInput()
        // Dropping a Task reference does NOT cancel it — cancel explicitly so the
        // results consumer cannot outlive the engine if the stream hasn't finished.
        resultsTask?.cancel()
        resultsTask = nil
        transcriber = nil
        analyzer = nil
    }

    /// Returns `true` only when BOTH microphone and speech-recognition permissions are granted.
    private func isPermissionGranted() async -> Bool {
        let isMicAuthorized = await authorizer.isMicrophonePermissionAuthorized()
        let isSpeechAuthorized = await authorizer.isSpeechPermissionAuthorized()
        return isMicAuthorized && isSpeechAuthorized
    }

    /// Maps `currentLocale` to an equivalent locale supported by `SpeechTranscriber`.
    ///
    /// - Throws: `SpeechError.unableToSupportLocale` when no equivalent exists.
    private func resolveLocale(with currentLocale: Locale) async throws -> Locale {
        guard let supported = await SpeechTranscriber.supportedLocale(equivalentTo: currentLocale) else {
            throw SpeechError.unableToSupportLocale
        }
        return supported
    }

    /// Ensures a speech model is available for `locale`.
    ///
    /// If the locale is supported but not installed, this will download and install the model.
    ///
    /// - Throws: `SpeechError.unableToSupportLocale` for unsupported locales,
    ///   or any error from the asset download/installation.
    private func ensureModelAvailable(transcriber: SpeechTranscriber, locale: Locale) async throws {
        guard await supported(locale: locale) else {
            throw SpeechError.unableToSupportLocale
        }
        if await installed(locale: locale) {
            return
        }
        try await downloadIfNeeded(for: transcriber)
    }

    /// Whether `locale` is among the transcriber's supported locales (compared by BCP-47 identifier).
    private func supported(locale: Locale) async -> Bool {
        let target = locale.identifier(.bcp47)
        return await SpeechTranscriber.supportedLocales.contains { $0.identifier(.bcp47) == target }
    }

    /// Whether the speech model for `locale` is already installed (compared by BCP-47 identifier).
    private func installed(locale: Locale) async -> Bool {
        let target = locale.identifier(.bcp47)
        // Compare directly; the old Set -> map -> contains round trip allocated twice for nothing.
        return await SpeechTranscriber.installedLocales.contains { $0.identifier(.bcp47) == target }
    }

    /// Downloads and installs the speech model assets for `module` when the system requires it.
    private func downloadIfNeeded(for module: SpeechTranscriber) async throws {
        if let downloader = try await AssetInventory.assetInstallationRequest(supporting: [module]) {
            try await downloader.downloadAndInstall()
        }
    }
}