fluffychat/scripts/translate/translate_keys.py
2025-11-05 10:38:16 -05:00

496 lines
17 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Translate a specific set of keys in the ARB files.
This script accepts a list of keys and:
1. Finds the value in the source ARB file (intl_en.arb)
2. Finds each instance of the value in the target ARB files (intl_*.arb)
3. Translates the value using OpenAI (reusing translate.py model)
4. Replaces the value in the target ARB files with the translated value
Usage:
python translate_keys.py --keys key1 key2 key3
python translate_keys.py --keys-file keys.txt
WARNING: Has not been tested extensively. Has not been tested with pluralization or
complex placeholders. Verify results before committing.
"""
import argparse
import sys
from pathlib import Path
from typing import Any, List
try:
from openai import OpenAI
except ImportError:
print("Error: openai package not found. Please install it with: pip install openai")
sys.exit(1)
# We'll implement all necessary functions locally to avoid path issues
# Get the client root directory (where this script is run from when called as module)
# If the current working directory is already "client", use it; otherwise walk
# three levels up from this file (scripts/translate/translate_keys.py -> client).
client_root = (
    Path.cwd() if Path.cwd().name == "client" else Path(__file__).parent.parent.parent
)
# Directory containing the ARB localization files (lib/l10n/intl_*.arb).
l10n_dir = client_root / "lib" / "l10n"
def load_translations(lang_code: str) -> dict[str, str]:
    """Return the translation mapping stored in intl_<lang_code>.arb.

    A missing file yields an empty dict rather than raising.
    """
    import json

    arb_path = l10n_dir / f"intl_{lang_code}.arb"
    if arb_path.exists():
        return json.loads(arb_path.read_text(encoding="utf-8"))
    return {}
def load_supported_languages() -> List[tuple[str, str]]:
    """Load the supported languages from the languages.json file with correct path.

    Returns:
        A list of (language_code, language_name) tuples, in file order.

    Raises:
        ValueError: if an entry is not a dict or lacks the required fields.
        FileNotFoundError: if scripts/languages.json does not exist.
    """
    import json

    languages_path = client_root / "scripts" / "languages.json"
    with open(languages_path, "r", encoding="utf-8") as f:
        raw_languages = json.load(f)

    languages: List[tuple[str, str]] = []
    for lang in raw_languages:
        # Explicit raises instead of `assert`: asserts are stripped when
        # Python runs with -O, which would silently skip this validation.
        if not isinstance(lang, dict):
            raise ValueError("Each language entry must be a dictionary.")
        language_code = lang.get("language_code", None)
        language_name = lang.get("language_name", None)
        if not (language_code and language_name):
            raise ValueError(
                "Each language must have a 'language_code' and "
                f"'language_name'. Found: {lang}"
            )
        languages.append((language_code, language_name))
    return languages
def save_translations(lang_code: str, translations: dict[str, str]) -> None:
    """Save translations for a language code using the correct path.

    Refreshes the @@locale / @@last_modified bookkeeping keys, merges the given
    translations into whatever the file already contains (existing keys keep
    their position, new keys are appended at the end), and rewrites the file.

    Note: mutates the caller's `translations` dict (adds the @@ keys).
    """
    import json
    from collections import OrderedDict
    from datetime import datetime

    path_to_translations = l10n_dir / f"intl_{lang_code}.arb"
    translations["@@locale"] = lang_code
    translations["@@last_modified"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    # Load existing data to preserve order if exists.
    existing_data = OrderedDict()
    if path_to_translations.exists():
        with open(path_to_translations, "r", encoding="utf-8") as f:
            try:
                existing_data = json.load(f, object_pairs_hook=OrderedDict)
            except json.JSONDecodeError:
                # Corrupt/empty file: start fresh rather than crash.
                existing_data = OrderedDict()

    # dict.update already does what the original if/else did (both branches
    # were identical): existing keys are overwritten in place with their
    # order unchanged, new keys are appended at the end.
    existing_data.update(translations)

    with open(path_to_translations, "w", encoding="utf-8") as f:
        f.write(json.dumps(existing_data, indent=2, ensure_ascii=False))
def _computed_placeholders(
    placeholder_names: List[str], english_meta: dict[str, Any]
) -> dict[str, dict]:
    """Build placeholder metadata, copying each placeholder's 'type' from the
    English metadata when one is declared there; otherwise leave it empty."""
    computed: dict[str, dict] = {}
    english_placeholders = english_meta.get("placeholders", {})
    for name in placeholder_names:
        computed[name] = {}
        placeholder_type = english_placeholders.get(name, {}).get("type")
        if placeholder_type:
            computed[name]["type"] = placeholder_type
    return computed


def _merged_meta(
    existing_meta: dict[str, Any],
    english_meta: dict[str, Any],
    computed_placeholders: dict[str, dict],
) -> dict[str, Any]:
    """Merge computed placeholders into existing metadata, or build fresh
    metadata whose 'type' is taken from the English entry (default String)."""
    if existing_meta:
        existing_meta.setdefault("type", "String")
        existing_meta["placeholders"] = computed_placeholders
        return existing_meta
    return {
        "type": english_meta.get("type", "String"),
        "placeholders": computed_placeholders,
    }


def reconcile_metadata(
    lang_code: str,
    translation_keys: List[str],
    english_translations_dict: dict[str, Any],
) -> None:
    """
    For each translation key, update its metadata (the key prefixed with '@') by merging
    any existing metadata with computed metadata. For basic translations, if no metadata exists,
    add it; otherwise, leave it as is.

    Bug fix vs. the previous version: for pluralized translations the 'type'
    looked up from the English metadata was computed but then discarded in
    favor of a hardcoded "String"; it is now actually used.
    """
    translations = load_translations(lang_code)
    for key in translation_keys:
        # Skip keys that weren't successfully translated.
        if key not in translations:
            continue
        translation = translations[key]
        meta_key = f"@{key}"
        existing_meta = translations.get(meta_key, {})
        english_meta = english_translations_dict.get(meta_key, {})
        assert isinstance(translation, str)

        has_plural = "plural," in translation
        has_other = "other{" in translation

        if "{" not in translation:
            # Case 1: basic translation, no placeholders. Only add metadata
            # when none exists yet; otherwise leave it untouched.
            if not existing_meta:
                translations[meta_key] = {"type": "String", "placeholders": {}}
        elif not has_plural and not has_other:
            # Case 2: placeholders without pluralization. Every "{name}"
            # occurrence becomes a placeholder entry.
            names = [chunk.split("}")[0] for chunk in translation.split("{")[1:]]
            translations[meta_key] = _merged_meta(
                existing_meta,
                english_meta,
                _computed_placeholders(names, english_meta),
            )
        elif has_plural and has_other:
            # Case 3: ICU pluralization. Only placeholder names appearing
            # before the "plural," keyword (e.g. "{count, plural, ...}")
            # are extracted.
            prefix = translation.split("plural,")[0].split("{")[1]
            names = [p.strip() for p in prefix.split(",") if p.strip() != ""]
            translations[meta_key] = _merged_meta(
                existing_meta,
                english_meta,
                _computed_placeholders(names, english_meta),
            )
        # Translations containing only one of the two plural markers fall
        # through untouched, matching the original branch conditions.
    save_translations(lang_code, translations)
def get_all_target_language_files() -> List[Path]:
    """Return every target ARB file: all intl_*.arb except the English source."""
    return [
        arb for arb in l10n_dir.glob("intl_*.arb") if arb.name != "intl_en.arb"
    ]
def extract_language_code_from_filename(arb_path: Path) -> str:
    """Derive the language code from an ARB filename (e.g., intl_es.arb -> es)."""
    # .stem drops the ".arb" suffix; stripping the "intl_" prefix leaves the code.
    return arb_path.stem.replace("intl_", "")
def translate_batch_with_openai(
    translation_requests: dict[str, str],
    target_language: str,
    source_language: str = "English",
) -> dict[str, str]:
    """
    Translate a batch of texts using OpenAI API.
    Args:
        translation_requests: Dictionary of {key: text} to translate
        target_language: Target language name (e.g., "Spanish", "French")
        source_language: Source language name (default: "English")
    Returns:
        Dictionary of {key: translated_text}. On any parsing, network, or API
        error the original texts are returned unchanged (best-effort fallback).
    """
    import json

    client = OpenAI()
    # Create a batch translation prompt.
    prompt = f"""
Translate the following JSON object from {source_language} to {target_language}.
Preserve any placeholders (text within curly braces like {{username}}) exactly as they are.
Preserve any special formatting or ICU message format syntax.
Return only a JSON object with the same keys but translated values.
JSON to translate: {json.dumps(translation_requests, indent=2)}
"""
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional translator. You translate text accurately while preserving any placeholders and special formatting. Always respond with valid JSON only.",
                },
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            model="gpt-4o-mini",
            temperature=0.0,
        )
        # content can be None (e.g. refusal/filtered response); guard it so we
        # hit the JSONDecodeError fallback instead of an AttributeError.
        response = (chat_completion.choices[0].message.content or "").strip()
        # Strip a Markdown code fence if the model added one, with or without
        # the "json" language tag.
        if response.startswith("```json"):
            response = response[7:]
        elif response.startswith("```"):
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]
        response = response.strip()
        # Parse the JSON response.
        translated_batch = json.loads(response)
        return translated_batch
    except json.JSONDecodeError as e:
        print(f"JSON parsing error when translating batch to {target_language}: {e}")
        print(f"Response was: {response[:200]}...")
        # Fallback to original texts if parsing fails
        return translation_requests
    except (ConnectionError, TimeoutError) as e:
        print(f"Network error translating batch to {target_language}: {e}")
        return translation_requests  # Return original texts if translation fails
    except ValueError as e:
        print(f"API error translating batch to {target_language}: {e}")
        return translation_requests  # Return original texts if translation fails
def get_language_display_name(lang_code: str) -> str:
    """Get the display name for a language code."""
    # Prefer the first matching entry declared in languages.json.
    declared = next(
        (
            display_name
            for code, display_name in load_supported_languages()
            if code == lang_code
        ),
        None,
    )
    if declared is not None:
        return declared
    # Fallback mapping for common language codes
    fallback_names = {
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian",
        "pt": "Portuguese",
        "pt_PT": "Portuguese",
        "ja": "Japanese",
        "ko": "Korean",
        "zh": "Chinese",
        "zh_Hant": "Traditional Chinese",
        "ru": "Russian",
        "ar": "Arabic",
        "hi": "Hindi",
        "nl": "Dutch",
        "sv": "Swedish",
        "da": "Danish",
        "no": "Norwegian",
        "nb": "Norwegian",
        "fi": "Finnish",
        "pl": "Polish",
        "cs": "Czech",
        "sk": "Slovak",
        "hu": "Hungarian",
        "ro": "Romanian",
        "bg": "Bulgarian",
        "hr": "Croatian",
        "sl": "Slovenian",
        "et": "Estonian",
        "lv": "Latvian",
        "lt": "Lithuanian",
        "el": "Greek",
        "tr": "Turkish",
        "th": "Thai",
        "vi": "Vietnamese",
        "id": "Indonesian",
        "ms": "Malay",
        "fil": "Filipino",
        "he": "Hebrew",
        "uk": "Ukrainian",
        "be": "Belarusian",
        "ca": "Catalan",
        "gl": "Galician",
        "eu": "Basque",
    }
    # Last resort: title-case the raw code itself.
    return fallback_names.get(lang_code, lang_code.title())
def translate_keys(keys_to_translate: List[str]) -> None:
    """
    Translate specific keys in all target ARB files using batch translation.

    For every non-English ARB file: the English value of each key is sent to
    the OpenAI batch translator, the translated value replaces the key's entry
    in that file, and the key's '@'-prefixed metadata is reconciled afterwards.

    Args:
        keys_to_translate: List of keys to translate
    """
    # Load English translations (source)
    english_translations = load_translations("en")
    # Validate that all keys exist in English
    missing_keys = [key for key in keys_to_translate if key not in english_translations]
    if missing_keys:
        print(
            f"Error: The following keys were not found in intl_en.arb: {missing_keys}"
        )
        return
    # Filter out metadata keys ('@'-prefixed); only plain values are translated.
    keys_to_translate = [key for key in keys_to_translate if not key.startswith("@")]
    # Get all target ARB files
    target_files = get_all_target_language_files()
    print(f"Found {len(target_files)} target language files")
    print(f"Translating {len(keys_to_translate)} keys: {keys_to_translate}")
    # Process keys in batches of 10 to keep each OpenAI request small.
    batch_size = 10
    for arb_file in target_files:
        lang_code = extract_language_code_from_filename(arb_file)
        lang_display_name = get_language_display_name(lang_code)
        print(f"\nProcessing {lang_display_name} ({lang_code})...")
        # Load current translations for this language
        current_translations = load_translations(lang_code)
        # Track which keys were actually updated
        updated_keys = []
        # Process keys in batches
        for i in range(0, len(keys_to_translate), batch_size):
            batch = keys_to_translate[i : i + batch_size]
            # Prepare translation requests for this batch
            translation_requests = {}
            for key in batch:
                english_text = english_translations[key]
                translation_requests[key] = english_text
            print(
                f" Translating batch {i//batch_size + 1} ({len(batch)} keys): {batch}"
            )
            # Translate the batch (falls back to English text on API failure).
            translated_batch = translate_batch_with_openai(
                translation_requests, lang_display_name
            )
            # Update translations and track updated keys
            for key in batch:
                if key in translated_batch:
                    translated_text = translated_batch[key]
                    current_translations[key] = translated_text
                    updated_keys.append(key)
                    print(
                        f" {key}: '{english_translations[key]}' -> '{translated_text}'"
                    )
                else:
                    print(f" Warning: Key '{key}' not found in translation response")
        # Save the updated translations, then fix up the '@key' metadata so
        # the ARB stays consistent with the new values.
        if updated_keys:
            save_translations(lang_code, current_translations)
            # Reconcile metadata for updated keys
            reconcile_metadata(lang_code, updated_keys, english_translations)
            print(f" ✓ Updated {len(updated_keys)} keys in {lang_code}")
        else:
            print(f" - No keys updated in {lang_code}")
def read_keys_from_file(file_path: Path) -> List[str]:
    """Read keys from a text file (one key per line).

    Blank lines and lines starting with '#' are skipped; surrounding
    whitespace is trimmed from each key.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Keys file not found: {file_path}")
    keys: List[str] = []
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if stripped and not stripped.startswith("#"):
                keys.append(stripped)
    return keys
def main():
    """Parse CLI arguments and translate the requested keys in all target ARB files."""
    parser = argparse.ArgumentParser(
        description="Translate specific keys in ARB files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Fixed examples: a `-m` module path must not end in `.py`, and the
        # keys-file argument is a filesystem path (slashes, not dots).
        epilog="""
Examples:
  pip install openai
  python translate_keys.py --keys about accept account
  python -m scripts.translate.translate_keys --keys-file scripts/translate/keys_to_translate.txt
""",
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--keys", nargs="+", help="List of keys to translate")
    group.add_argument(
        "--keys-file",
        type=Path,
        help="File containing keys to translate (one per line)",
    )
    args = parser.parse_args()

    # Get the keys to translate
    if args.keys:
        keys_to_translate = args.keys
    else:
        keys_to_translate = read_keys_from_file(args.keys_file)
    if not keys_to_translate:
        print("Error: No keys to translate")
        sys.exit(1)

    # Import before the try block: previously `import os` sat inside `try`,
    # so a failure during the import itself would have made the `finally`
    # clause raise NameError on `os.chdir`.
    import os

    # Change to the client directory so relative paths work
    original_cwd = Path.cwd()
    client_dir = Path(__file__).parent.parent.parent
    try:
        os.chdir(client_dir)
        translate_keys(keys_to_translate)
    finally:
        # Always restore the caller's working directory.
        os.chdir(original_cwd)
    print("\n✓ Translation complete!")


if __name__ == "__main__":
    main()