fluffychat/lib/pangea/choreographer/igc/text_normalization_util.dart
2025-11-11 15:16:48 -05:00

112 lines
3.4 KiB
Dart
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import 'package:diacritic/diacritic.dart';
import 'package:fluffychat/pangea/common/utils/error_handler.dart';
/// Matches the ASCII hyphen-minus and the dash-like code points
/// U+2010–U+2015, U+2212, U+FE58, U+FE63 and U+FF0D.
final RegExp _dashLikePattern =
    RegExp(r'[-\u2010-\u2015\u2212\uFE58\uFE63\uFF0D]');

/// Matches any character in the Unicode Punctuation (`P`) or Symbol (`S`)
/// general categories.
final RegExp _punctuationOrSymbolPattern =
    RegExp(r'[\p{P}\p{S}]', unicode: true);

/// Matches a run of one or more whitespace characters.
final RegExp _whitespaceRunPattern = RegExp(r'\s+');

/// Normalizes [input] so that two strings can be compared loosely.
///
/// The intent is that differences in case, diacritics, punctuation and
/// spacing do not matter, e.g. `esta` should compare equal to `está` and
/// `hello!` to `Hello`.
///
/// Steps, in order:
/// 1. Lowercase the whole string.
/// 2. Apply the diacritic policy for [languageCode]
///    (see [_applyLanguageSpecificNormalization]).
/// 3. Replace hyphens and dash-like characters with a space, so hyphenated
///    words become separate words instead of being glued together.
/// 4. Remove every remaining punctuation or symbol character.
/// 5. Collapse whitespace runs to a single space and trim both ends.
///
/// Returns the normalized string. If any step throws, the error is reported
/// through [ErrorHandler.logError] and the original [input] is returned
/// unchanged.
String normalizeString(String input, String languageCode) {
  try {
    return _applyLanguageSpecificNormalization(
      input.toLowerCase(),
      languageCode,
    )
        // Dashes must become spaces before the punctuation pass below,
        // which would otherwise delete them outright.
        .replaceAll(_dashLikePattern, ' ')
        .replaceAll(_punctuationOrSymbolPattern, '')
        .replaceAll(_whitespaceRunPattern, ' ')
        .trim();
  } catch (e, s) {
    ErrorHandler.logError(
      e: e,
      s: s,
      // The language code selects the normalization branch, so it is needed
      // alongside the input to reproduce a failure.
      data: {'input': input, 'languageCode': languageCode},
    );
    return input;
  }
}
/// Applies the diacritic policy selected by [languageCode] to [text].
///
/// * `de`: expands `ß` to `ss`, then strips diacritics with
///   [removeDiacritics].
/// * `el`: folds accented Greek vowels with [_removeGreekAccents].
/// * `da`, `no`, `nb`, `sv`, `ca` and the listed non-Latin-script languages:
///   [text] is returned unchanged.
/// * Any other code: diacritics are stripped with [removeDiacritics].
///
/// [languageCode] is compared exactly, so differently-cased codes and
/// region-tagged codes other than `zh-CN` / `zh-TW` take the default branch.
String _applyLanguageSpecificNormalization(String text, String languageCode) {
  switch (languageCode) {
    case 'de': // German
      // Expand ß before stripping diacritics so the outcome is always "ss",
      // independent of how removeDiacritics itself treats ß.
      return removeDiacritics(text.replaceAll('ß', 'ss'));
    case 'da': // Danish
    case 'no': // Norwegian
    case 'nb': // Norwegian Bokmål
    case 'sv': // Swedish
      // Some Nordic tests expect characters to be preserved, so æøå are
      // kept intact for now.
      return text;
    case 'el': // Greek
      // Greek needs accent removal.
      return _removeGreekAccents(text);
    case 'ca': // Catalan
      // Catalan expects some characters preserved (òç etc.).
      return text;
    case 'ar': // Arabic
    case 'he': // Hebrew
    case 'fa': // Persian/Farsi
    case 'ur': // Urdu
    case 'ja': // Japanese
    case 'ko': // Korean
    case 'zh': // Chinese
    case 'zh-CN': // Chinese Simplified
    case 'zh-TW': // Chinese Traditional
    case 'hi': // Hindi
    case 'bn': // Bengali
    case 'gu': // Gujarati
    case 'kn': // Kannada
    case 'mr': // Marathi
    case 'pa': // Punjabi
    case 'ru': // Russian
    case 'bg': // Bulgarian
    case 'uk': // Ukrainian
    case 'sr': // Serbian
    case 'am': // Amharic
      // Keep the original text for non-Latin scripts.
      return text;
    default:
      // Default Latin-script handling.
      return removeDiacritics(text);
  }
}
/// Maps each precomposed accented Greek vowel to its unaccented base letter.
///
/// Covers tonos (ά), dialytika (ϊ) and dialytika with tonos (ΐ), in lower
/// and upper case where a precomposed form exists.
const Map<String, String> _greekAccentFolds = {
  // Lowercase with tonos.
  'ά': 'α',
  'έ': 'ε',
  'ή': 'η',
  'ί': 'ι',
  'ό': 'ο',
  'ύ': 'υ',
  'ώ': 'ω',
  // Lowercase with dialytika, with and without tonos.
  'ϊ': 'ι',
  'ϋ': 'υ',
  'ΐ': 'ι',
  'ΰ': 'υ',
  // Uppercase with tonos.
  'Ά': 'Α',
  'Έ': 'Ε',
  'Ή': 'Η',
  'Ί': 'Ι',
  'Ό': 'Ο',
  'Ύ': 'Υ',
  'Ώ': 'Ω',
  // Uppercase with dialytika.
  'Ϊ': 'Ι',
  'Ϋ': 'Υ',
};

/// Character class matching every key of [_greekAccentFolds].
final RegExp _greekAccentedVowel =
    RegExp('[${_greekAccentFolds.keys.join()}]');

/// Returns [text] with accented Greek vowels replaced by their base letters.
///
/// Only the precomposed characters listed in [_greekAccentFolds] are folded;
/// every other character, including non-Greek text, is returned unchanged.
String _removeGreekAccents(String text) {
  return text.replaceAllMapped(_greekAccentedVowel, (match) {
    // Group 0 is the whole match and is always present.
    final vowel = match[0]!;
    return _greekAccentFolds[vowel] ?? vowel;
  });
}