initial work to remove speech to text controller from pangea controller

This commit is contained in:
ggurdin 2025-12-03 12:07:36 -05:00
parent 4dc948d197
commit a56ba59316
No known key found for this signature in database
GPG key ID: A01CB41737CBB478
10 changed files with 291 additions and 334 deletions

View file

@ -21,8 +21,8 @@ import 'package:fluffychat/pangea/learning_settings/controllers/language_control
import 'package:fluffychat/pangea/learning_settings/utils/locale_provider.dart';
import 'package:fluffychat/pangea/learning_settings/utils/p_language_store.dart';
import 'package:fluffychat/pangea/spaces/controllers/space_code_controller.dart';
import 'package:fluffychat/pangea/speech_to_text/speech_to_text_controller.dart';
import 'package:fluffychat/pangea/subscription/controllers/subscription_controller.dart';
import 'package:fluffychat/pangea/toolbar/controllers/speech_to_text_controller.dart';
import 'package:fluffychat/pangea/toolbar/controllers/text_to_speech_controller.dart';
import 'package:fluffychat/pangea/toolbar/controllers/tts_controller.dart';
import 'package:fluffychat/pangea/user/controllers/permissions_controller.dart';

View file

@ -19,9 +19,10 @@ import 'package:fluffychat/pangea/events/repo/language_detection_request.dart';
import 'package:fluffychat/pangea/events/repo/language_detection_response.dart';
import 'package:fluffychat/pangea/learning_settings/utils/p_language_store.dart';
import 'package:fluffychat/pangea/spaces/models/space_model.dart';
import 'package:fluffychat/pangea/speech_to_text/audio_encoding_enum.dart';
import 'package:fluffychat/pangea/speech_to_text/speech_to_text_request_model.dart';
import 'package:fluffychat/pangea/speech_to_text/speech_to_text_response_model.dart';
import 'package:fluffychat/pangea/toolbar/controllers/text_to_speech_controller.dart';
import 'package:fluffychat/pangea/toolbar/enums/audio_encoding_enum.dart';
import 'package:fluffychat/pangea/toolbar/models/speech_to_text_models.dart';
import 'package:fluffychat/pangea/toolbar/widgets/message_audio_card.dart';
import 'package:fluffychat/pangea/translation/full_text_translation_request_model.dart';
import 'package:fluffychat/widgets/future_loading_dialog.dart';
@ -227,13 +228,13 @@ class PangeaMessageEvent {
null;
}).toSet();
SpeechToTextModel? getSpeechToTextLocal() {
SpeechToTextResponseModel? getSpeechToTextLocal() {
final rawBotTranscription =
event.content.tryGetMap(ModelKey.botTranscription);
if (rawBotTranscription != null) {
try {
return SpeechToTextModel.fromJson(
return SpeechToTextResponseModel.fromJson(
Map<String, dynamic>.from(rawBotTranscription),
);
} catch (err, s) {
@ -257,7 +258,7 @@ class PangeaMessageEvent {
.speechToText;
}
Future<SpeechToTextModel> getSpeechToText(
Future<SpeechToTextResponseModel> getSpeechToText(
String l1Code,
String l2Code,
) async {
@ -268,7 +269,8 @@ class PangeaMessageEvent {
final rawBotTranscription =
event.content.tryGetMap(ModelKey.botTranscription);
if (rawBotTranscription != null) {
final SpeechToTextModel botTranscription = SpeechToTextModel.fromJson(
final SpeechToTextResponseModel botTranscription =
SpeechToTextResponseModel.fromJson(
Map<String, dynamic>.from(rawBotTranscription),
);
@ -290,7 +292,7 @@ class PangeaMessageEvent {
return botTranscription;
}
final SpeechToTextModel? speechToTextLocal = representations
final SpeechToTextResponseModel? speechToTextLocal = representations
.firstWhereOrNull(
(element) => element.content.speechToText != null,
)
@ -303,7 +305,7 @@ class PangeaMessageEvent {
final matrixFile = await _event.downloadAndDecryptAttachment();
final SpeechToTextModel response =
final SpeechToTextResponseModel response =
await MatrixState.pangeaController.speechToText.get(
SpeechToTextRequestModel(
audioContent: matrixFile.bytes,

View file

@ -8,7 +8,7 @@ import 'package:fluffychat/pangea/choreographer/choreo_record_model.dart';
import 'package:fluffychat/pangea/choreographer/igc/pangea_match_status_enum.dart';
import 'package:fluffychat/pangea/common/utils/error_handler.dart';
import 'package:fluffychat/pangea/events/models/pangea_token_model.dart';
import 'package:fluffychat/pangea/toolbar/models/speech_to_text_models.dart';
import 'package:fluffychat/pangea/speech_to_text/speech_to_text_response_model.dart';
import 'package:fluffychat/widgets/matrix.dart';
/// this class is contained within a [RepresentationEvent]
@ -30,7 +30,7 @@ class PangeaRepresentation {
bool originalWritten;
// a representation can be create via speech to text on the original message
SpeechToTextModel? speechToText;
SpeechToTextResponseModel? speechToText;
// how do we know which representation was sent by author?
// RepresentationEvent.text == PangeaMessageEvent.event.body
@ -70,7 +70,7 @@ class PangeaRepresentation {
originalWritten: json[_originalWrittenKey] ?? false,
speechToText: json[_speechToTextKey] == null
? null
: SpeechToTextModel.fromJson(json[_speechToTextKey]),
: SpeechToTextResponseModel.fromJson(json[_speechToTextKey]),
);
}

View file

@ -26,11 +26,8 @@ enum AudioEncodingEnum {
speexWithHeaderByte,
mp3,
mp4,
webmOpus,
}
webmOpus;
// Utility extension to map enum values to their corresponding string value as used by the API
extension AudioEncodingExtension on AudioEncodingEnum {
String get value {
switch (this) {
case AudioEncodingEnum.linear16:

View file

@ -0,0 +1,105 @@
import 'dart:convert';
import 'package:async/async.dart';
import 'package:http/http.dart';
import 'package:fluffychat/pangea/common/config/environment.dart';
import 'package:fluffychat/pangea/common/network/requests.dart';
import 'package:fluffychat/pangea/common/network/urls.dart';
import 'package:fluffychat/pangea/common/utils/error_handler.dart';
import 'package:fluffychat/pangea/speech_to_text/speech_to_text_request_model.dart';
import 'package:fluffychat/pangea/speech_to_text/speech_to_text_response_model.dart';
/// One entry in [SpeechToTextRepo]'s in-memory cache: the (possibly still
/// in-flight) response future plus the insertion time used for expiry.
class _SpeechToTextCacheItem {
  const _SpeechToTextCacheItem({required this.data, required this.timestamp});

  /// The cached (or pending) transcription response.
  final Future<SpeechToTextResponseModel> data;

  /// When this entry was inserted; compared against the cache duration.
  final DateTime timestamp;
}
/// Static repository wrapping the Choreo speech-to-text endpoint.
///
/// Successful (and in-flight) lookups are cached in memory for
/// [_cacheDuration], keyed by [SpeechToTextRequestModel.hashCode], so
/// repeated transcriptions of the same audio share one network call.
class SpeechToTextRepo {
  // NOTE(review): keys are request hashCodes, so a hash collision between two
  // distinct requests would serve the wrong cached transcript. Likely
  // acceptable for a 10-minute cache, but worth confirming.
  static final Map<String, _SpeechToTextCacheItem> _cache = {};
  static const Duration _cacheDuration = Duration(minutes: 10);

  /// Returns the transcription for [request], serving a cached future when a
  /// non-expired entry exists.
  static Future<Result<SpeechToTextResponseModel>> get(
    String accessToken,
    SpeechToTextRequestModel request,
  ) {
    final cached = _getCached(request);
    if (cached != null) {
      return _getResult(request, cached);
    }

    // Cache the future itself so concurrent identical requests coalesce into
    // a single network call.
    final future = _fetch(accessToken, request);
    _setCached(request, future);
    return _getResult(request, future);
  }

  /// POSTs [request] to the speech-to-text endpoint and decodes the response.
  ///
  /// Throws an [Exception] on any non-200 status.
  static Future<SpeechToTextResponseModel> _fetch(
    String accessToken,
    SpeechToTextRequestModel request,
  ) async {
    final Requests req = Requests(
      choreoApiKey: Environment.choreoApiKey,
      accessToken: accessToken,
    );

    // Bug fix: this previously posted to PApiUrls.simpleTranslation
    // (copy-paste from the translation repo); the removed
    // SpeechToTextController used PApiUrls.speechToText for this call.
    final Response res = await req.post(
      url: PApiUrls.speechToText,
      body: request.toJson(),
    );

    if (res.statusCode != 200) {
      throw Exception(
        'Failed to convert speech to text: ${res.statusCode} ${res.reasonPhrase}',
      );
    }

    return SpeechToTextResponseModel.fromJson(
      jsonDecode(utf8.decode(res.bodyBytes)),
    );
  }

  /// Awaits [future] and wraps the outcome in a [Result], evicting the cache
  /// entry and logging on failure so a failed fetch can be retried later.
  static Future<Result<SpeechToTextResponseModel>> _getResult(
    SpeechToTextRequestModel request,
    Future<SpeechToTextResponseModel> future,
  ) async {
    try {
      final res = await future;
      return Result.value(res);
    } catch (e, s) {
      // Drop the failed future so the next identical request re-fetches.
      _cache.remove(request.hashCode.toString());
      ErrorHandler.logError(
        e: e,
        s: s,
        data: request.toJson(),
      );
      return Result.error(e);
    }
  }

  /// Returns the cached future for [request], or null if absent/expired.
  ///
  /// Also sweeps out any expired entries as a side effect.
  static Future<SpeechToTextResponseModel>? _getCached(
    SpeechToTextRequestModel request,
  ) {
    // Snapshot `now` once (loop-invariant) and collect expired keys before
    // mutating the map.
    final now = DateTime.now();
    final expiredKeys = _cache.keys
        .where((key) => now.difference(_cache[key]!.timestamp) >= _cacheDuration)
        .toList();
    for (final key in expiredKeys) {
      _cache.remove(key);
    }
    return _cache[request.hashCode.toString()]?.data;
  }

  /// Stores [response] under [request]'s hash with the current timestamp.
  static void _setCached(
    SpeechToTextRequestModel request,
    Future<SpeechToTextResponseModel> response,
  ) =>
      _cache[request.hashCode.toString()] = _SpeechToTextCacheItem(
        data: response,
        timestamp: DateTime.now(),
      );
}

View file

@ -0,0 +1,70 @@
import 'dart:convert';
import 'package:flutter/foundation.dart';
import 'package:matrix/matrix.dart';
import 'package:fluffychat/pangea/speech_to_text/audio_encoding_enum.dart';
/// Payload for a speech-to-text API call.
///
/// [audioEvent] is deliberately excluded from [toJson], [==], and [hashCode];
/// it is only carried along so callers can attach the transcription back to
/// the originating Matrix event.
class SpeechToTextRequestModel {
  final Uint8List audioContent;
  final SpeechToTextAudioConfigModel config;
  final Event? audioEvent;

  SpeechToTextRequestModel({
    required this.audioContent,
    required this.config,
    this.audioEvent,
  });

  /// Wire format: the raw audio bytes travel base64-encoded.
  Map<String, dynamic> toJson() => {
        "audio_content": base64Encode(audioContent),
        "config": config.toJson(),
      };

  @override
  bool operator ==(Object other) =>
      identical(this, other) ||
      (other is SpeechToTextRequestModel &&
          listEquals(audioContent, other.audioContent) &&
          config == other.config);

  @override
  int get hashCode {
    // Hash only a short prefix of the audio so hashing stays cheap for large
    // recordings. Equal full arrays imply equal prefixes, so the hashCode
    // contract with == still holds (collisions are merely more likely).
    final prefix =
        audioContent.length > 10 ? audioContent.sublist(0, 10) : audioContent;
    return Object.hashAll([
      Object.hashAll(prefix),
      config.hashCode,
    ]);
  }
}
/// Audio format and learner-language configuration sent alongside the raw
/// audio bytes in a [SpeechToTextRequestModel].
class SpeechToTextAudioConfigModel {
  final AudioEncodingEnum encoding;
  final int sampleRateHertz;
  final bool enableWordConfidence;
  final bool enableAutomaticPunctuation;
  final String userL1;
  final String userL2;

  SpeechToTextAudioConfigModel({
    required this.encoding,
    required this.userL1,
    required this.userL2,
    this.sampleRateHertz = 16000,
    this.enableWordConfidence = true,
    this.enableAutomaticPunctuation = true,
  });

  /// Wire format expected by the speech-to-text endpoint (snake_case keys).
  Map<String, dynamic> toJson() {
    return <String, dynamic>{
      "encoding": encoding.value,
      "sample_rate_hertz": sampleRateHertz,
      "user_l1": userL1,
      "user_l2": userL2,
      "enable_word_confidence": enableWordConfidence,
      "enable_automatic_punctuation": enableAutomaticPunctuation,
    };
  }
}

View file

@ -1,79 +1,118 @@
import 'dart:convert';
import 'package:flutter/foundation.dart';
import 'package:flutter/material.dart';
import 'package:matrix/matrix.dart';
import 'package:fluffychat/config/app_config.dart';
import 'package:fluffychat/pangea/analytics_misc/construct_use_type_enum.dart';
import 'package:fluffychat/pangea/analytics_misc/constructs_model.dart';
import 'package:fluffychat/pangea/events/models/pangea_token_model.dart';
import 'package:fluffychat/pangea/toolbar/enums/audio_encoding_enum.dart';
const int thresholdForGreen = 80;
class SpeechToTextResponseModel {
final List<SpeechToTextResult> results;
class SpeechToTextAudioConfigModel {
final AudioEncodingEnum encoding;
final int sampleRateHertz;
final bool enableWordConfidence;
final bool enableAutomaticPunctuation;
final String userL1;
final String userL2;
SpeechToTextAudioConfigModel({
required this.encoding,
required this.userL1,
required this.userL2,
this.sampleRateHertz = 16000,
this.enableWordConfidence = true,
this.enableAutomaticPunctuation = true,
SpeechToTextResponseModel({
required this.results,
});
Transcript get transcript => results.first.transcripts.first;
String get langCode => results.first.transcripts.first.langCode;
factory SpeechToTextResponseModel.fromJson(Map<String, dynamic> json) {
final results = json['results'] as List;
if (results.isEmpty) {
throw Exception('SpeechToTextModel.fromJson: results is empty');
}
return SpeechToTextResponseModel(
results: (json['results'] as List)
.map((e) => SpeechToTextResult.fromJson(e))
.toList(),
);
}
Map<String, dynamic> toJson() => {
"encoding": encoding.value,
"sample_rate_hertz": sampleRateHertz,
"user_l1": userL1,
"user_l2": userL2,
"enable_word_confidence": enableWordConfidence,
"enable_automatic_punctuation": enableAutomaticPunctuation,
"results": results.map((e) => e.toJson()).toList(),
};
List<OneConstructUse> constructs(
String roomId,
String eventId,
) {
final List<OneConstructUse> constructs = [];
final metadata = ConstructUseMetaData(
roomId: roomId,
eventId: eventId,
timeStamp: DateTime.now(),
);
for (final sstToken in transcript.sttTokens) {
final token = sstToken.token;
if (!token.lemma.saveVocab) continue;
constructs.addAll(
token.allUses(
ConstructUseTypeEnum.pvm,
metadata,
ConstructUseTypeEnum.pvm.pointValue,
),
);
}
return constructs;
}
}
class SpeechToTextResult {
final List<Transcript> transcripts;
SpeechToTextResult({required this.transcripts});
factory SpeechToTextResult.fromJson(Map<String, dynamic> json) =>
SpeechToTextResult(
transcripts: (json['transcripts'] as List)
.map((e) => Transcript.fromJson(e))
.toList(),
);
Map<String, dynamic> toJson() => {
"transcripts": transcripts.map((e) => e.toJson()).toList(),
};
}
class SpeechToTextRequestModel {
final Uint8List audioContent;
final SpeechToTextAudioConfigModel config;
final Event? audioEvent;
class Transcript {
final String text;
final int confidence;
final List<STTToken> sttTokens;
final String langCode;
final int? wordsPerHr;
SpeechToTextRequestModel({
required this.audioContent,
required this.config,
this.audioEvent,
Transcript({
required this.text,
required this.confidence,
required this.sttTokens,
required this.langCode,
required this.wordsPerHr,
});
/// Returns the number of words per minute rounded to one decimal place.
double? get wordsPerMinute => wordsPerHr != null ? wordsPerHr! / 60 : null;
factory Transcript.fromJson(Map<String, dynamic> json) => Transcript(
text: json['transcript'],
confidence: json['confidence'] <= 100
? json['confidence']
: json['confidence'] / 100,
sttTokens: (json['stt_tokens'] as List)
.map((e) => STTToken.fromJson(e))
.toList(),
langCode: json['lang_code'],
wordsPerHr: json['words_per_hr'],
);
Map<String, dynamic> toJson() => {
"audio_content": base64Encode(audioContent),
"config": config.toJson(),
"transcript": text,
"confidence": confidence,
"stt_tokens": sttTokens.map((e) => e.toJson()).toList(),
"lang_code": langCode,
"words_per_hr": wordsPerHr,
};
@override
bool operator ==(Object other) {
if (identical(this, other)) return true;
if (other is! SpeechToTextRequestModel) return false;
return listEquals(audioContent, other.audioContent) &&
config == other.config;
}
@override
int get hashCode {
final bytesSample =
audioContent.length > 10 ? audioContent.sublist(0, 10) : audioContent;
return Object.hashAll([
Object.hashAll(bytesSample),
config.hashCode,
]);
}
Color get color => confidence > 80 ? AppConfig.success : AppConfig.warning;
}
class STTToken {
@ -94,15 +133,7 @@ class STTToken {
int get length => token.text.length;
Color color(BuildContext context) {
// turning off the color coding for now
// whisper doesn't include word-level confidence
// if (confidence == null) {
return Theme.of(context).colorScheme.onSurface;
// }
// if (confidence! > thresholdForGreen) {
// return AppConfig.success;
// }
// return AppConfig.warning;
}
factory STTToken.fromJson(Map<String, dynamic> json) {
@ -147,118 +178,3 @@ class STTToken {
]);
}
}
class Transcript {
final String text;
final int confidence;
final List<STTToken> sttTokens;
final String langCode;
final int? wordsPerHr;
Transcript({
required this.text,
required this.confidence,
required this.sttTokens,
required this.langCode,
required this.wordsPerHr,
});
/// Returns the number of words per minute rounded to one decimal place.
double? get wordsPerMinute => wordsPerHr != null ? wordsPerHr! / 60 : null;
factory Transcript.fromJson(Map<String, dynamic> json) => Transcript(
text: json['transcript'],
confidence: json['confidence'] <= 100
? json['confidence']
: json['confidence'] / 100,
sttTokens: (json['stt_tokens'] as List)
.map((e) => STTToken.fromJson(e))
.toList(),
langCode: json['lang_code'],
wordsPerHr: json['words_per_hr'],
);
Map<String, dynamic> toJson() => {
"transcript": text,
"confidence": confidence,
"stt_tokens": sttTokens.map((e) => e.toJson()).toList(),
"lang_code": langCode,
"words_per_hr": wordsPerHr,
};
Color color(BuildContext context) {
if (confidence > thresholdForGreen) {
return AppConfig.success;
}
return AppConfig.warning;
}
}
class SpeechToTextResult {
final List<Transcript> transcripts;
SpeechToTextResult({required this.transcripts});
factory SpeechToTextResult.fromJson(Map<String, dynamic> json) =>
SpeechToTextResult(
transcripts: (json['transcripts'] as List)
.map((e) => Transcript.fromJson(e))
.toList(),
);
Map<String, dynamic> toJson() => {
"transcripts": transcripts.map((e) => e.toJson()).toList(),
};
}
class SpeechToTextModel {
final List<SpeechToTextResult> results;
SpeechToTextModel({
required this.results,
});
Transcript get transcript => results.first.transcripts.first;
String get langCode => results.first.transcripts.first.langCode;
factory SpeechToTextModel.fromJson(Map<String, dynamic> json) {
final results = json['results'] as List;
if (results.isEmpty) {
throw Exception('SpeechToTextModel.fromJson: results is empty');
}
return SpeechToTextModel(
results: (json['results'] as List)
.map((e) => SpeechToTextResult.fromJson(e))
.toList(),
);
}
Map<String, dynamic> toJson() => {
"results": results.map((e) => e.toJson()).toList(),
};
List<OneConstructUse> constructs(
String roomId,
String eventId,
) {
final List<OneConstructUse> constructs = [];
final metadata = ConstructUseMetaData(
roomId: roomId,
eventId: eventId,
timeStamp: DateTime.now(),
);
for (final sstToken in transcript.sttTokens) {
final token = sstToken.token;
if (!token.lemma.saveVocab) continue;
constructs.addAll(
token.allUses(
ConstructUseTypeEnum.pvm,
metadata,
ConstructUseTypeEnum.pvm.pointValue,
),
);
}
return constructs;
}
}

View file

@ -1,133 +0,0 @@
import 'dart:async';
import 'dart:convert';
import 'package:flutter/foundation.dart';
import 'package:http/http.dart';
import 'package:fluffychat/pangea/common/controllers/pangea_controller.dart';
import 'package:fluffychat/pangea/common/utils/error_handler.dart';
import 'package:fluffychat/pangea/events/constants/pangea_event_types.dart';
import 'package:fluffychat/pangea/events/models/representation_content_model.dart';
import 'package:fluffychat/pangea/extensions/pangea_room_extension.dart';
import 'package:fluffychat/pangea/toolbar/models/speech_to_text_models.dart';
import '../../common/config/environment.dart';
import '../../common/network/requests.dart';
import '../../common/network/urls.dart';
// Assuming SpeechToTextRequestModel, SpeechToTextModel and related models are already defined as in your provided code.
class _SpeechToTextCacheItem {
Future<SpeechToTextModel> data;
_SpeechToTextCacheItem({required this.data});
}
class SpeechToTextController {
static final Map<int, _SpeechToTextCacheItem> _cache = {};
late final PangeaController _pangeaController;
Timer? _cacheClearTimer;
SpeechToTextController(this._pangeaController) {
_initializeCacheClearing();
}
void _initializeCacheClearing() {
const duration = Duration(minutes: 2);
_cacheClearTimer = Timer.periodic(duration, (Timer t) => _clearCache());
}
void _clearCache() {
_cache.clear();
}
void dispose() {
_cacheClearTimer?.cancel();
}
Future<SpeechToTextModel> get(
SpeechToTextRequestModel requestModel,
) async {
final int cacheKey = requestModel.hashCode;
if (_cache.containsKey(cacheKey)) {
return _cache[cacheKey]!.data;
} else {
final Future<SpeechToTextModel> response = _fetchResponse(
accessToken: _pangeaController.userController.accessToken,
requestModel: requestModel,
);
_cache[cacheKey] = _SpeechToTextCacheItem(data: response);
return response;
}
}
Future<void> saveSpeechToTextAsRepresentationEvent(
SpeechToTextModel response,
SpeechToTextRequestModel requestModel,
) {
if (requestModel.audioEvent == null) {
debugPrint(
'Audio event is null, case of giving speech to text before message sent, currently not implemented',
);
return Future.value(null);
}
debugPrint('Saving transcript as matrix event');
requestModel.audioEvent?.room
.sendPangeaEvent(
content: PangeaRepresentation(
langCode: response.langCode,
text: response.transcript.text,
originalSent: false,
originalWritten: false,
speechToText: response,
).toJson(),
parentEventId: requestModel.audioEvent!.eventId,
type: PangeaEventTypes.representation,
)
.then(
(_) => debugPrint('Transcript saved as matrix event'),
);
return Future.value(null);
}
Future<SpeechToTextModel> _fetchResponse({
required String accessToken,
required SpeechToTextRequestModel requestModel,
}) async {
final Requests request = Requests(
choreoApiKey: Environment.choreoApiKey,
accessToken: accessToken,
);
final Response res = await request.post(
url: PApiUrls.speechToText,
body: requestModel.toJson(),
);
if (res.statusCode == 200) {
final Map<String, dynamic> json = jsonDecode(utf8.decode(res.bodyBytes));
final response = SpeechToTextModel.fromJson(json);
saveSpeechToTextAsRepresentationEvent(response, requestModel).onError(
(error, stackTrace) => ErrorHandler.logError(
e: error,
s: stackTrace,
data: {
"response": response.toJson(),
"requestModel": requestModel.toJson(),
},
),
);
return response;
} else {
debugPrint('Error converting speech to text: ${res.body}');
throw Exception('Failed to convert speech to text');
}
}
}

View file

@ -9,17 +9,17 @@ import 'package:path_provider/path_provider.dart';
import 'package:fluffychat/pangea/common/utils/async_state.dart';
import 'package:fluffychat/pangea/events/event_wrappers/pangea_message_event.dart';
import 'package:fluffychat/pangea/events/extensions/pangea_event_extension.dart';
import 'package:fluffychat/pangea/toolbar/models/speech_to_text_models.dart';
import 'package:fluffychat/pangea/speech_to_text/speech_to_text_response_model.dart';
import 'package:fluffychat/pangea/toolbar/widgets/message_audio_card.dart';
import 'package:fluffychat/pangea/toolbar/widgets/select_mode_buttons.dart';
import 'package:fluffychat/widgets/matrix.dart';
class _TranscriptionLoader extends AsyncLoader<SpeechToTextModel> {
class _TranscriptionLoader extends AsyncLoader<SpeechToTextResponseModel> {
final PangeaMessageEvent messageEvent;
_TranscriptionLoader(this.messageEvent) : super();
@override
Future<SpeechToTextModel> fetch() => messageEvent.getSpeechToText(
Future<SpeechToTextResponseModel> fetch() => messageEvent.getSpeechToText(
MatrixState.pangeaController.languageController.userL1!.langCodeShort,
MatrixState.pangeaController.languageController.userL2!.langCodeShort,
);
@ -127,7 +127,7 @@ class SelectModeController {
ValueNotifier<AsyncState<String>> get translationState =>
_translationLoader.state;
ValueNotifier<AsyncState<SpeechToTextModel>> get transcriptionState =>
ValueNotifier<AsyncState<SpeechToTextResponseModel>> get transcriptionState =>
_transcriptLoader.state;
ValueNotifier<AsyncState<String>> get speechTranslationState =>

View file

@ -2,10 +2,10 @@ import 'package:flutter/material.dart';
import 'package:fluffychat/pangea/events/models/pangea_token_model.dart';
import 'package:fluffychat/pangea/message_token_text/tokens_util.dart';
import 'package:fluffychat/pangea/toolbar/models/speech_to_text_models.dart';
import 'package:fluffychat/pangea/speech_to_text/speech_to_text_response_model.dart';
class SttTranscriptTokens extends StatelessWidget {
final SpeechToTextModel model;
final SpeechToTextResponseModel model;
final TextStyle? style;
final void Function(PangeaToken)? onClick;