112 lines
3.4 KiB
Dart
112 lines
3.4 KiB
Dart
import 'package:diacritic/diacritic.dart';
|
||
|
||
import 'package:fluffychat/pangea/common/utils/error_handler.dart';
|
||
|
||
/// Matches the ASCII hyphen and the Unicode hyphen, dash, and minus-sign
/// characters that [normalizeString] turns into spaces.
final RegExp _dashLikePattern =
    RegExp(r'[-\u2010-\u2015\u2212\uFE58\uFE63\uFF0D]');

/// Matches any character in the Unicode punctuation (`P`) or symbol (`S`)
/// categories.
final RegExp _punctuationOrSymbolPattern =
    RegExp(r'[\p{P}\p{S}]', unicode: true);

/// Matches a run of one or more whitespace characters.
final RegExp _whitespaceRunPattern = RegExp(r'\s+');

/// Normalizes [input] so that two strings can be compared leniently.
///
/// The goal is for pairs such as `esta` / `está` and `hello!` / `Hello` to
/// compare equal. The pipeline is:
///
/// 1. lowercase the text;
/// 2. apply the per-language rules selected by [languageCode]
///    (see [_applyLanguageSpecificNormalization]);
/// 3. turn hyphens and other dash-like characters into spaces;
/// 4. delete every remaining punctuation or symbol character;
/// 5. collapse whitespace runs to a single space and trim both ends.
///
/// If any step throws, the error is reported through
/// [ErrorHandler.logError] and [input] is returned unmodified.
String normalizeString(String input, String languageCode) {
  try {
    final lowered = input.toLowerCase();
    final localized = _applyLanguageSpecificNormalization(
      lowered,
      languageCode,
    );

    // Dashes become spaces (not nothing) so hyphenated words stay separated
    // once punctuation is stripped in the next step.
    final dashesSpaced = localized.replaceAll(_dashLikePattern, ' ');
    final stripped = dashesSpaced.replaceAll(_punctuationOrSymbolPattern, '');

    return stripped.replaceAll(_whitespaceRunPattern, ' ').trim();
  } catch (e, s) {
    ErrorHandler.logError(e: e, s: s, data: {'input': input});
    // Best effort: fall back to the raw text rather than failing the caller.
    return input;
  }
}
|
||
|
||
/// Applies the diacritic-handling rule for [languageCode] to [text].
///
/// * Codes in the pass-through set below: [text] is returned unchanged.
/// * `el` (Greek): accents are stripped with [_removeGreekAccents].
/// * `de` (German): [removeDiacritics] is applied, then `ß` becomes `ss`.
/// * Any other code: treated as Latin script and passed through
///   [removeDiacritics].
///
/// [languageCode] is compared exactly as given (case-sensitive, including any
/// region suffix such as `zh-CN`).
String _applyLanguageSpecificNormalization(String text, String languageCode) {
  // Languages whose text must be left exactly as it is.
  const passthroughCodes = <String>{
    // Nordic languages: keep æ, ø, å intact for now (some tests expect it).
    'da', // Danish
    'no', // Norwegian
    'nb', // Norwegian Bokmål
    'sv', // Swedish
    // Catalan: keep ò, ç, etc. intact.
    'ca',
    // Non-Latin scripts: keep the original text.
    'ar', // Arabic
    'he', // Hebrew
    'fa', // Persian/Farsi
    'ur', // Urdu
    'ja', // Japanese
    'ko', // Korean
    'zh', // Chinese
    'zh-CN', // Chinese Simplified
    'zh-TW', // Chinese Traditional
    'hi', // Hindi
    'bn', // Bengali
    'gu', // Gujarati
    'kn', // Kannada
    'mr', // Marathi
    'pa', // Punjabi
    'ru', // Russian
    'bg', // Bulgarian
    'uk', // Ukrainian
    'sr', // Serbian
    'am', // Amharic
  };

  if (passthroughCodes.contains(languageCode)) {
    return text;
  }

  if (languageCode == 'el') {
    return _removeGreekAccents(text);
  }

  // Everything else is handled as Latin script.
  final withoutDiacritics = removeDiacritics(text);
  if (languageCode == 'de') {
    // German additionally folds ß to ss.
    return withoutDiacritics.replaceAll('ß', 'ss');
  }
  return withoutDiacritics;
}
|
||
|
||
/// Precomposed monotonic Greek vowels that carry a tonos (accent), mapped to
/// the same letter without it.
///
/// `ΐ` and `ΰ` carry both a dialytika and a tonos; only the tonos is removed,
/// so they map to `ϊ` and `ϋ`.
const Map<String, String> _greekTonosReplacements = {
  'ά': 'α',
  'έ': 'ε',
  'ή': 'η',
  'ί': 'ι',
  'ό': 'ο',
  'ύ': 'υ',
  'ώ': 'ω',
  'Ά': 'Α',
  'Έ': 'Ε',
  'Ή': 'Η',
  'Ί': 'Ι',
  'Ό': 'Ο',
  'Ύ': 'Υ',
  'Ώ': 'Ω',
  'ΐ': 'ϊ',
  'ΰ': 'ϋ',
};

/// Character class matching every key of [_greekTonosReplacements].
///
/// All keys are single UTF-16 code units with no special meaning inside a
/// character class, so they can be joined directly.
final RegExp _greekTonosPattern =
    RegExp('[${_greekTonosReplacements.keys.join()}]');

/// Returns [text] with the tonos removed from precomposed Greek vowels.
///
/// Each accented vowel listed in [_greekTonosReplacements] is replaced by its
/// unaccented counterpart in a single pass; every other character (including
/// an empty string) is returned untouched.
String _removeGreekAccents(String text) {
  return text.replaceAllMapped(_greekTonosPattern, (match) {
    final accented = match.group(0);
    // The pattern is built from the map keys, so the lookup always succeeds;
    // the fallbacks only satisfy the nullable types.
    return _greekTonosReplacements[accented] ?? accented ?? '';
  });
}
|