fix: group adjacent punctuation tokens with content tokens to prevent line breaks, added token positions cache (#3713)

2025-08-13 10:50:40 -04:00 · 2025-08-13 10:50:40 -04:00 · fe7e5385e8
commit fe7e5385e8
parent fd617f296f
3 changed files with 104 additions and 9 deletions
--- a/lib/pages/chat/events/html_message.dart
+++ b/lib/pages/chat/events/html_message.dart
@ -14,6 +14,7 @@ import 'package:fluffychat/pages/chat/chat.dart';
 import 'package:fluffychat/pangea/events/event_wrappers/pangea_message_event.dart';
 import 'package:fluffychat/pangea/events/models/pangea_token_model.dart';
 import 'package:fluffychat/pangea/message_token_text/message_token_button.dart';
+import 'package:fluffychat/pangea/message_token_text/token_position_model.dart';
 import 'package:fluffychat/pangea/toolbar/enums/reading_assistance_mode_enum.dart';
 import 'package:fluffychat/pangea/toolbar/utils/token_rendering_util.dart';
 import 'package:fluffychat/pangea/toolbar/widgets/message_selection_overlay.dart';
@ -157,7 +158,7 @@ class HtmlMessage extends StatelessWidget {
      pangeaMessageEvent?.messageDisplayRepresentation?.tokens
          ?.where(
            (t) =>
-                !["PUNCT", "SYM"].contains(t.pos) &&
+                !["SYM"].contains(t.pos) &&
                !t.lemma.text.contains(RegExp(r'[0-9]')) &&
                t.lemma.text.length <= 50,
          )
@ -209,17 +210,25 @@ class HtmlMessage extends StatelessWidget {
    }

    int position = 0;
-    for (final PangeaToken token in tokens ?? []) {
-      final String tokenText = token.text.content;
+    final tokenPositions = tokens != null
+        ? TokensUtil.getAdjacentTokenPositions(event.eventId, tokens!)
+        : [];
+
+    for (final TokenPosition tokenPosition in tokenPositions) {
+      final String tokenSpanText = tokens!
+          .sublist(tokenPosition.startIndex, tokenPosition.endIndex + 1)
+          .map((t) => t.text.content)
+          .join();
+
      final substringIndex = result.indexWhere(
        (string) =>
-            string.contains(tokenText) &&
+            string.contains(tokenSpanText) &&
            !(string.startsWith('<') && string.endsWith('>')),
        position,
      );

      if (substringIndex == -1) continue;
-      int tokenIndex = result[substringIndex].indexOf(tokenText);
+      int tokenIndex = result[substringIndex].indexOf(tokenSpanText);
      if (tokenIndex == -1) continue;

      final beforeSubstring = result[substringIndex].substring(0, tokenIndex);
@ -227,7 +236,7 @@ class HtmlMessage extends StatelessWidget {
        tokenIndex = beforeSubstring.characters.length;
      }

-      final int tokenLength = tokenText.characters.length;
+      final int tokenLength = tokenSpanText.characters.length;
      final before =
          result[substringIndex].characters.take(tokenIndex).toString();
      final after = result[substringIndex]
@ -237,7 +246,7 @@ class HtmlMessage extends StatelessWidget {

      result.replaceRange(substringIndex, substringIndex + 1, [
        if (before.isNotEmpty) before,
-        '<token offset="${token.text.offset}" length="${token.text.length}">$tokenText</token>',
+        '<token offset="${tokenPosition.token!.text.offset}" length="${tokenPosition.token!.text.length}">$tokenSpanText</token>',
        if (after.isNotEmpty) after,
      ]);

--- a/lib/pangea/message_token_text/token_position_model.dart
+++ b/lib/pangea/message_token_text/token_position_model.dart
@ -13,7 +13,82 @@ class TokenPosition {
 }

 class TokensUtil {
-  static List<TokenPosition> getTokenPositions(
+  /// Adjacent-token positions that have already been computed, keyed by
+  /// event ID.
+  static final Map<String, _TokenPositionCacheItem> _tokenPositionCache = {};
+
+  /// How long a cache entry stays valid after it was stored.
+  static const Duration _cacheDuration = Duration(minutes: 1);
+
+  /// Looks up the cached positions for [eventID].
+  ///
+  /// Returns `null` when there is no entry, or when the entry is older than
+  /// [_cacheDuration]; an expired entry is removed from the cache as a side
+  /// effect of the lookup.
+  static List<TokenPosition>? _getCachedTokenPositions(String eventID) {
+    final entry = _tokenPositionCache[eventID];
+    if (entry == null) return null;
+
+    final oldestAllowed = DateTime.now().subtract(_cacheDuration);
+    if (!entry.timestamp.isBefore(oldestAllowed)) {
+      return entry.positions;
+    }
+
+    // Entry has expired: drop it so the caller recomputes.
+    _tokenPositionCache.remove(eventID);
+    return null;
+  }
+
+  /// Stores [positions] in the cache under [eventID], stamped with the
+  /// current time.
+  ///
+  /// Expired entries are pruned on every write. Lookups only evict the entry
+  /// for the event ID being requested, so without this sweep entries for
+  /// events that are never requested again would stay in the static map
+  /// indefinitely.
+  static void _setCachedTokenPositions(
+    String eventID,
+    List<TokenPosition> positions,
+  ) {
+    final now = DateTime.now();
+    final oldestAllowed = now.subtract(_cacheDuration);
+
+    // Same expiry rule as the lookup path: anything stamped before the
+    // cutoff is stale.
+    _tokenPositionCache.removeWhere(
+      (_, item) => item.timestamp.isBefore(oldestAllowed),
+    );
+
+    _tokenPositionCache[eventID] = _TokenPositionCacheItem(positions, now);
+  }
+
+  /// Returns positions for content tokens grouped with the tokens that
+  /// directly touch them, so callers can treat each group as one unit.
+  ///
+  /// A group starts at a token that is punctuation (`pos == 'PUNCT'`) or that
+  /// is immediately followed in [tokens] by punctuation. It then extends over
+  /// every following token whose `start` equals the previous token's `end`.
+  /// Any other token forms a group of its own.
+  ///
+  /// In each returned [TokenPosition], `startIndex` and `endIndex` are
+  /// inclusive indices into [tokens], and `token` is the first
+  /// non-punctuation token of the group. Groups made up only of punctuation
+  /// are omitted, so the result may have gaps and should not be used to
+  /// fully reconstruct the original message.
+  ///
+  /// Results are cached per [eventID] for [_cacheDuration].
+  static List<TokenPosition> getAdjacentTokenPositions(
+    String eventID,
+    List<PangeaToken> tokens,
+  ) {
+    final cached = _getCachedTokenPositions(eventID);
+    // The cache is keyed by event ID only, so an entry may have been computed
+    // from a different token list. Reuse it only if every cached index is
+    // still in range for [tokens]; otherwise callers that slice [tokens] with
+    // these indices would throw a RangeError.
+    // NOTE(review): a bounds check cannot detect a changed list of the same
+    // length; a stronger cache key would be needed for that.
+    if (cached != null &&
+        cached.every((position) => position.endIndex < tokens.length)) {
+      return cached;
+    }
+
+    final List<TokenPosition> positions = [];
+    int startIndex = 0;
+    while (startIndex < tokens.length) {
+      int endIndex = startIndex;
+
+      final startsWithPunct = tokens[startIndex].pos == 'PUNCT';
+      final followedByPunct = startIndex + 1 < tokens.length &&
+          tokens[startIndex + 1].pos == 'PUNCT';
+
+      if (startsWithPunct || followedByPunct) {
+        // Absorb every following token that begins exactly where the
+        // previous one ends (no whitespace or other gap between them).
+        while (endIndex + 1 < tokens.length &&
+            tokens[endIndex].end == tokens[endIndex + 1].start) {
+          endIndex++;
+        }
+      }
+
+      final group = tokens.sublist(startIndex, endIndex + 1);
+      final contentIndex = group.indexWhere((t) => t.pos != 'PUNCT');
+
+      // Skip groups that contain nothing but punctuation.
+      if (contentIndex != -1) {
+        positions.add(
+          TokenPosition(
+            token: group[contentIndex],
+            startIndex: startIndex,
+            endIndex: endIndex,
+          ),
+        );
+      }
+
+      startIndex = endIndex + 1;
+    }
+
+    _setCachedTokenPositions(eventID, positions);
+    return positions;
+  }
+
+  /// Given a list of tokens, returns the positions needed to reconstruct the original message, including positions for the gaps left by non-token elements.
+  static List<TokenPosition> getGlobalTokenPositions(
    List<PangeaToken> tokens,
  ) {
    final List<TokenPosition> tokenPositions = [];
@ -83,3 +158,13 @@ class TokensUtil {
    return tokenPositions;
  }
 }
+
+/// A cache entry pairing computed token positions with the time they were
+/// stored.
+class _TokenPositionCacheItem {
+  _TokenPositionCacheItem(this.positions, this.timestamp);
+
+  /// When this entry was stored; used to decide whether it has expired.
+  final DateTime timestamp;
+
+  /// The cached token positions.
+  final List<TokenPosition> positions;
+}
--- a/lib/pangea/toolbar/widgets/stt_transcript_tokens.dart
+++ b/lib/pangea/toolbar/widgets/stt_transcript_tokens.dart
@ -37,7 +37,8 @@ class SttTranscriptTokens extends StatelessWidget {
      textScaler: TextScaler.noScaling,
      text: TextSpan(
        style: style ?? DefaultTextStyle.of(context).style,
-        children: TokensUtil.getTokenPositions(tokens).map((tokenPosition) {
+        children:
+            TokensUtil.getGlobalTokenPositions(tokens).map((tokenPosition) {
          final text = messageCharacters
              .skip(tokenPosition.startIndex)
              .take(tokenPosition.endIndex - tokenPosition.startIndex)