Revision control

Copy as Markdown

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/
import AVFoundation
import Shared
import Speech
/// A speech transcription engine backed by `AVAudioEngine` and `SFSpeechRecognizer`.
///
/// Responsible for:
/// - Managing permissions
/// - Streaming transcription results
@MainActor
final class SFSpeechRecognizerEngine: TranscriptionEngine {
private let audioManager: AudioManagerProtocol
private let authorizer: AuthorizeProvider
private var recognitionTask: SFSpeechRecognitionTask?
private let speechRecognizer: SpeechRecognizerProvider?
private var request: SFSpeechAudioBufferRecognitionRequest?
init(
locale: Locale = Locale.current,
audioManager: AudioManagerProtocol,
speechRecognizer: SpeechRecognizerProvider? = nil,
authorizer: AuthorizeProvider
) {
self.audioManager = audioManager
self.speechRecognizer = speechRecognizer ?? SFSpeechRecognizer(locale: locale)
self.authorizer = authorizer
}
func prepare() async throws {
guard await isPermissionGranted() else {
throw SpeechError.permissionDenied
}
try audioManager.configureAudioSession()
}
func start(continuation: AsyncThrowingStream<SpeechResult, any Error>.Continuation) async throws {
let recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
recognitionRequest.requiresOnDeviceRecognition = true
self.request = recognitionRequest
// Start capturing audio and append buffers to the recognition request
try audioManager.startCapture(bufferSize: 4096) { buffer, _ in
recognitionRequest.append(buffer)
}
try audioManager.prepareAndStartEngine()
// Start recognizer task if available
guard let speechRecognizer, speechRecognizer.isAvailable else {
throw SpeechError.recognizerNotAvailable
}
recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
if let error {
continuation.finish(throwing: error)
return
}
guard let result else { return }
let chunk = result.bestTranscription.formattedString
let formattedWords = chunk
.components(separatedBy: .whitespacesAndNewlines)
.filter { !$0.isEmpty }
// `isFinal` is not reliable so we add extra checks
// such as a confidence level as well as a cap of 50 words.
// From manual testing, the confidence level is usually > 0.0 when user finishes speaking.
let confidence = result.bestTranscription.segments[safe: 0]?.confidence ?? 0.0
let additionalChecks = confidence > 0.0 || formattedWords.count >= 50
let shouldFinish = result.isFinal || result.speechRecognitionMetadata != nil || additionalChecks
let speechResult = SpeechResult(
text: chunk,
isFinal: shouldFinish
)
continuation.yield(speechResult)
if shouldFinish {
continuation.finish()
}
}
}
func stop() {
audioManager.stopEngine()
request?.endAudio()
recognitionTask?.finish()
}
private func isPermissionGranted() async -> Bool {
let isMicAuthorized = await authorizer.isMicrophonePermissionAuthorized()
let isSpeechAuthorized = await authorizer.isSpeechPermissionAuthorized()
return isMicAuthorized && isSpeechAuthorized
}
}