fluffychat/scripts/find_unused_intl_keys.py

#!/usr/bin/env python3
"""
Script to find unused translation keys in intl_en.arb after line 3243.

This script:
1. Reads intl_en.arb and extracts all translation keys after line 3243
2. Filters out metadata keys (those starting with @)
3. Searches the repository for references to each key
4. Writes a JSON file listing the unused keys

Usage:
    python3 scripts/find_unused_intl_keys.py

Output:
    scripts/unused_intl_keys.json - JSON file containing the list of unused keys
"""

import json
import os
import re
import subprocess
from pathlib import Path
from typing import Set, List


def extract_keys_after_line(arb_file_path: str, start_line: int = 3243) -> List[str]:
    """
    Extract translation keys from an .arb file that are defined after a line.

    ARB files are JSON objects in which keys starting with ``@`` carry
    metadata. Only real translation keys (non-``@`` keys) are returned, and
    only when their top-level definition first appears on a line strictly
    greater than ``start_line``.

    A top-level definition is recognised textually: the line must begin with
    exactly two spaces followed by ``"keyName":``. Keys nested inside
    metadata objects (for example placeholder names) are indented deeper and
    are therefore never mistaken for top-level keys. Keys whose definition
    line does not follow this layout are not returned.

    Args:
        arb_file_path: Path to the .arb file
        start_line: Line number to start extracting from (1-indexed,
            exclusive: a key defined exactly on this line is skipped)

    Returns:
        List of translation key names, in the order they appear in the
        parsed JSON object
    """
    # Read the file once; the same text feeds both the JSON parser and the
    # line-number index below.
    with open(arb_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    data = json.loads(''.join(lines))

    # Single pass over the file: remember the first line on which each
    # top-level key is defined. This replaces a per-key rescan of every line.
    top_level_key = re.compile(r'^  "(.*?)":')
    first_line_of = {}
    for line_num, line in enumerate(lines, start=1):
        match = top_level_key.match(line)
        if match:
            # setdefault keeps the FIRST occurrence if a key is repeated.
            first_line_of.setdefault(match.group(1), line_num)

    return [
        key
        for key in data
        if not key.startswith('@')
        and key in first_line_of
        and first_line_of[key] > start_line
    ]


def search_key_in_repository(key: str, repo_path: str, exclude_dirs: Set[str]) -> bool:
    """
    Report whether a translation key is referenced outside the .arb files.

    The search is delegated to ``git grep``, so only files tracked by git
    below ``repo_path`` are considered. The key is matched as a literal,
    whole word: a key such as ``ok`` does not match ``token``, and regex
    metacharacters in the key have no special meaning.

    A matching file counts as a reference unless it is an ``.arb`` file or
    lives below a directory whose name is listed in ``exclude_dirs``.

    Args:
        key: Translation key to search for
        repo_path: Path to the repository root (used as git's working dir)
        exclude_dirs: Directory names to ignore at any depth of the path

    Returns:
        True if the key is found in at least one eligible file, False
        otherwise. False is also returned (with a warning on stderr) when
        git itself fails, e.g. when ``repo_path`` is not a git repository.
    """
    import sys  # local import: only needed for the failure warning below

    # -l: list matching files only      -z: NUL-separated, unquoted paths
    # -w: whole-word match              -F: treat the key as a fixed string
    # -e: keep a key starting with '-' from being parsed as an option
    result = subprocess.run(
        ['git', 'grep', '-l', '-z', '-w', '-F', '-e', key],
        cwd=repo_path,
        capture_output=True,
        text=True,
    )

    # git grep exits with 0 when something matched and 1 when nothing did;
    # any other status means git could not perform the search at all.
    if result.returncode == 1:
        return False
    if result.returncode != 0:
        print(
            f"Warning: git grep failed for key {key!r} "
            f"(exit {result.returncode}): {result.stderr.strip()}",
            file=sys.stderr,
        )
        return False

    excluded = set(exclude_dirs or ())
    for file in result.stdout.split('\0'):
        if not file or file.endswith('.arb'):
            continue
        # git always reports paths with forward slashes; the last component
        # is the file name and is not compared against directory names.
        if not excluded.isdisjoint(file.split('/')[:-1]):
            continue
        return True

    return False


def find_unused_keys(arb_file_path: str, repo_path: str, start_line: int = 3243) -> List[str]:
    """
    Collect the translation keys that have no reference in the repository.

    The candidate keys are those defined in the .arb file after
    ``start_line``; each one is looked up in the repository and kept when
    the lookup reports no reference. Progress and a summary are printed to
    stdout along the way.

    Args:
        arb_file_path: Path to the .arb file
        repo_path: Path to the repository root
        start_line: Line number to start checking from

    Returns:
        List of unused keys, in the order they were checked
    """
    # Directory names handed to the repository search as exclusions.
    ignored_dirs = {'.git', 'build', 'node_modules', '.dart_tool', 'l10n'}

    print(f"Extracting keys from {arb_file_path} after line {start_line}...")
    candidates = extract_keys_after_line(arb_file_path, start_line)
    total = len(candidates)
    print(f"Found {total} translation keys to check.\n")

    print("Searching repository for key references...")
    unreferenced = []
    for position, name in enumerate(candidates, start=1):
        # Report progress on every tenth key, before that key is searched.
        if position % 10 == 0:
            print(f"  Checked {position}/{total} keys...")

        if not search_key_in_repository(name, repo_path, ignored_dirs):
            unreferenced.append(name)

    # Every key is either used or unused, so the used count is the remainder.
    print("\nSearch complete!")
    print(f"Total keys checked: {total}")
    print(f"Used keys: {total - len(unreferenced)}")
    print(f"Unused keys: {len(unreferenced)}")

    return unreferenced


def main() -> int:
    """
    Run the unused key finder and write the results to disk.

    The repository root is taken to be the parent of the directory holding
    this script. The English .arb file is expected at
    ``lib/l10n/intl_en.arb`` below that root, and the report is written to
    ``scripts/unused_intl_keys.json``.

    Returns:
        0 on success, 1 when the .arb file cannot be found.
    """
    import sys  # local import: only needed to report the error on stderr

    # Single source of truth for the cut-off line; it is used for the
    # search and recorded in the JSON report.
    start_line = 3243

    # Get repository root
    repo_path = Path(__file__).parent.parent.absolute()
    arb_file_path = repo_path / 'lib' / 'l10n' / 'intl_en.arb'

    if not arb_file_path.exists():
        print(f"Error: Could not find {arb_file_path}", file=sys.stderr)
        return 1

    unused_keys = sorted(
        find_unused_keys(str(arb_file_path), str(repo_path), start_line=start_line)
    )

    # Print results
    print("\n" + "=" * 80)
    print("RESULTS")
    print("=" * 80)

    if unused_keys:
        # The search looks at every tracked file except .arb files, not
        # only at Dart sources, so the banner says exactly that.
        print(f"\nFound {len(unused_keys)} unused keys (not referenced in any non-.arb files):")
        print("-" * 80)
        for key in unused_keys:
            print(f"  - {key}")
    else:
        print("\nNo unused keys found! All keys are referenced in the codebase.")

    # Save results to JSON file
    output_file = repo_path / 'scripts' / 'unused_intl_keys.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            'unused_keys': unused_keys,
            'count': len(unused_keys),
            'source_file': str(arb_file_path),
            'start_line': start_line
        }, f, indent=2, ensure_ascii=False)

    print(f"\nJSON output saved to: {output_file}")

    return 0


if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the bare exit() helper is injected by the
    # site module for interactive use and is missing under `python -S`.
    raise SystemExit(main())