fluffychat/scripts/translate/translate.py

"""
Prerequisites:
- Ensure you have an up-to-date `needed-translations.txt` file should you wish to translate only the missing translation keys. To generate an updated `needed-translations.txt` file, run:
```
flutter gen-l10n
```

- Ensure you have python `openai` package installed. If not, run:
```
pip install openai
```

- Ensure you have an OpenAI API key set in your environment variable `OPENAI_API_KEY`. If not, you can set it by running:
```
export OPENAI_API_KEY=your-api-key
```

- Ensure the `vi` (Vietnamese) translations are up-to-date. This script uses existing en->vi translations as examples in the translation prompt, so they must be present. If they are not, run:
```
python scripts/translate.py --lang vi --lang-display-name "Vietnamese" --mode append
```

3 modes:
- append mode (default): translate only the missing translation keys
- upsert mode (not implemented): translate everything (all keys from English)
- update mode (not implemented): specify keys to translate and update their metadata

Usage:
python scripts/translate.py
"""

import argparse
import json
import random
from collections import OrderedDict
from datetime import datetime
from pathlib import Path
from typing import Any

from openai import OpenAI

l10n_dir = Path(__file__).parent.parent / "lib" / "l10n"


def load_all_keys() -> list[str]:
    """
    Return every translation key found in intl_en.arb.

    Entries whose name starts with '@' (metadata) are left out.
    Raises FileNotFoundError when intl_en.arb is not present in the l10n directory.
    """
    english_arb = l10n_dir / "intl_en.arb"
    if english_arb.exists():
        entries = json.loads(english_arb.read_text(encoding="utf-8"))
        keys = []
        for name in entries.keys():
            if name.startswith("@"):
                continue
            keys.append(name)
        return keys

    raise FileNotFoundError(
        f"File not found: {english_arb}. Please run `flutter gen-l10n` to generate the file."
    )


def load_needed_translations() -> dict[str, list[str]]:
    """
    Load the per-language lists of missing translation keys.

    The data is read as JSON from `needed-translations.txt`, located two directory
    levels above this script file. Any supported language that has no entry in that
    file is mapped to the complete list of English keys.

    Raises FileNotFoundError when `needed-translations.txt` does not exist.
    """
    report_path = Path(__file__).parent.parent / "needed-translations.txt"
    if not report_path.exists():
        raise FileNotFoundError(
            f"File not found: {report_path}. Please run `flutter gen-l10n` to generate the file."
        )
    missing_by_language = json.loads(report_path.read_text(encoding="utf-8"))

    languages = load_supported_languages()
    every_key = load_all_keys()
    for code, _display_name in languages:
        # Languages absent from the report get the full key list.
        missing_by_language.setdefault(code, every_key)

    return missing_by_language


def load_translations(lang_code: str) -> dict[str, str]:
    """
    Read intl_<lang_code>.arb from the l10n directory and return its parsed JSON content.

    An empty dict is returned when the file does not exist.
    """
    arb_path = l10n_dir / f"intl_{lang_code}.arb"
    if arb_path.exists():
        return json.loads(arb_path.read_text(encoding="utf-8"))
    return {}


def save_translations(lang_code: str, translations: dict[str, str]) -> None:
    """
    Write `translations` into intl_<lang_code>.arb, keeping the key order of the existing file.

    The passed dict is stamped in place with `@@locale` and `@@last_modified` before
    writing. Keys already present in the file keep their position and receive the new
    value; unseen keys are appended at the end. If the existing file is not valid JSON,
    its content is discarded and the file is rewritten from `translations` alone.
    """
    arb_path = l10n_dir / f"intl_{lang_code}.arb"

    # Note: these two assignments mutate the caller's dict.
    translations["@@locale"] = lang_code
    translations["@@last_modified"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    # Start from the on-disk content (when readable) so the existing order is preserved.
    merged = OrderedDict()
    if arb_path.exists():
        with open(arb_path, "r", encoding="utf-8") as existing_file:
            try:
                merged = json.load(existing_file, object_pairs_hook=OrderedDict)
            except json.JSONDecodeError:
                merged = OrderedDict()

    # Assigning to an existing key keeps its position; a new key lands at the end.
    for name in translations:
        merged[name] = translations[name]

    with open(arb_path, "w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(merged, indent=2, ensure_ascii=False))


def reconcile_metadata(
    lang_code: str,
    translation_keys: list[str],
    english_translations_dict: dict[str, Any],
) -> None:
    """
    Ensure each translated key of a language file has matching '@key' metadata.

    The saved translations for `lang_code` are loaded, metadata is computed for every
    key in `translation_keys` that is present in that file, and the result is saved
    back. Three message shapes are recognised:

    - basic (no '{'): metadata with empty placeholders is added only when none exists;
    - placeholders ('{' present, neither 'plural,' nor 'other{'): every '{name}'
      becomes a placeholder;
    - plural ('{', 'plural,' and 'other{' all present): the comma-separated names in
      the text between the first '{' and the next '{' or 'plural,' become placeholders.

    Messages containing only one of 'plural,' / 'other{' match none of the shapes and
    are left untouched. Placeholder types, and the top-level type when no metadata
    exists yet, are copied from the English metadata when available.

    Args:
        lang_code: Language code used to locate the translation file.
        translation_keys: Keys whose metadata should be reconciled.
        english_translations_dict: Parsed English translations including '@' metadata.
    """
    translations = load_translations(lang_code)

    for key in translation_keys:
        # Skip keys that weren't successfully translated
        if key not in translations:
            continue

        translation = translations[key]
        meta_key = f"@{key}"
        existing_meta = translations.get(meta_key, {})
        assert isinstance(translation, str)

        # Case 1: Basic translations, no placeholders.
        if "{" not in translation:
            if not existing_meta:
                translations[meta_key] = {"type": "String", "placeholders": {}}
            # if metadata exists, leave it as is.
            continue

        has_plural = "plural," in translation
        has_other = "other{" in translation

        if not has_plural and not has_other:
            # Case 2: Translations with placeholders (no pluralization).
            placeholder_names = [
                segment.split("}")[0] for segment in translation.split("{")[1:]
            ]
        elif has_plural and has_other:
            # Case 3: Translations with pluralization. Only the names in the text
            # between the first '{' and the next '{' or 'plural,' are collected.
            prefix = translation.split("plural,")[0].split("{")[1]
            placeholder_names = [
                name.strip() for name in prefix.split(",") if name.strip() != ""
            ]
        else:
            # Only one of the plural markers is present: shape not recognised.
            continue

        english_meta = english_translations_dict.get(meta_key, {})
        english_placeholders = english_meta.get("placeholders", {})

        computed_placeholders: dict[str, dict[str, Any]] = {}
        for name in placeholder_names:
            computed_placeholders[name] = {}
            # Look up the type of *this* placeholder in the English metadata. The
            # plural branch previously used a variable left over from the
            # placeholder branch, which was either unbound or the wrong name.
            placeholder_type = english_placeholders.get(name, {}).get("type")
            if placeholder_type:
                computed_placeholders[name]["type"] = placeholder_type

        if existing_meta:
            # Merge computed placeholders into existing metadata.
            existing_meta.setdefault("type", "String")
            existing_meta["placeholders"] = computed_placeholders
            translations[meta_key] = existing_meta
        else:
            # Obtain type from english translation or default to "String"
            # (now applied to both the placeholder and the plural case).
            translations[meta_key] = {
                "type": english_meta.get("type", "String"),
                "placeholders": computed_placeholders,
            }

    save_translations(lang_code, translations)


def _sample_example_keys(candidates: list[str], count: int = 2) -> list[str]:
    """Return `count` randomly chosen keys, or all of `candidates` if there are no more than `count`."""
    if len(candidates) > count:
        return random.sample(candidates, count)
    return candidates


def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) from `text`."""
    cleaned = text.strip()
    cleaned = cleaned.removeprefix("```json").removeprefix("```")
    cleaned = cleaned.removesuffix("```")
    return cleaned.strip()


def append_translate(lang_code: str, lang_display_name: str) -> None:
    """
    Translate the needed translations from English to the target language.

    The keys listed as missing for `lang_code` are sent to the chat model in chunks
    of 20, together with a few English->Vietnamese example pairs. Each reply is
    parsed as JSON; on a parse failure the model is asked to correct its answer, up
    to three parse attempts per chunk, after which the chunk is skipped. Accepted
    translations are merged into the language file and its metadata is reconciled.

    Args:
        lang_code: Language code of the target translation file.
        lang_display_name: Human-readable language name inserted into the prompt.
    """
    needed_translations = load_needed_translations().get(lang_code, [])
    english_translations_dict = load_translations("en")
    vietnamese_translations_dict = load_translations("vi")

    # there are 3 types of translation keys: basic, with placeholders, with pluralization. Read more: TRANSLATORS_GUIDE.md
    basic_translation_keys: list[str] = []
    placeholder_translation_keys: list[str] = []
    plural_translation_keys: list[str] = []
    for key, text in english_translations_dict.items():
        # Only keys that also exist in the Vietnamese file can serve as examples;
        # otherwise building the example pairs would raise KeyError.
        if key.startswith("@") or key not in vietnamese_translations_dict:
            continue
        if not isinstance(text, str):
            continue
        has_plural = "plural," in text
        has_other = "other{" in text
        if "{" not in text:
            # "Basic" means no placeholders at all, matching reconcile_metadata.
            basic_translation_keys.append(key)
        elif not has_plural and not has_other:
            placeholder_translation_keys.append(key)
        elif has_plural and has_other:
            plural_translation_keys.append(key)

    # build example translations
    example_keys = [
        *_sample_example_keys(basic_translation_keys),
        *_sample_example_keys(placeholder_translation_keys),
        *_sample_example_keys(plural_translation_keys),
    ]
    example_english_translations = {
        key: english_translations_dict[key] for key in example_keys
    }
    example_vietnamese_translations = {
        key: vietnamese_translations_dict[key] for key in example_keys
    }

    chunk_size = 20
    max_retries = 3
    model = "gpt-4.1-nano"
    system_message = {
        "role": "system",
        "content": "You are a translator that will only response to translation requests in json format without any additional information.",
    }

    # Created on first use so that a run with nothing to translate needs no API key,
    # and reused afterwards instead of being rebuilt for every chunk.
    client = None
    new_translations = {}
    progress = 0
    total = len(needed_translations)
    for start in range(0, total, chunk_size):
        chunk = needed_translations[start : start + chunk_size]
        progress += len(chunk)

        # Keys missing from the English file cannot be translated; leave them out
        # rather than failing the whole run with KeyError.
        translation_requests = {
            key: english_translations_dict[key]
            for key in chunk
            if key in english_translations_dict
        }
        if not translation_requests:
            print(
                f"No English source text for chunk {start // chunk_size + 1}. Skipping."
            )
            continue

        prompt = f"""
        Please translate the following text from English to {lang_display_name}.
        Example:
        req: {json.dumps(example_english_translations, indent=2)}
        res: {json.dumps(example_vietnamese_translations, indent=2)}
        ========================
        req: {json.dumps(translation_requests, indent=2)}
        res:
        """

        if client is None:
            client = OpenAI()

        messages = [system_message, {"role": "user", "content": prompt}]
        chat_completion = client.chat.completions.create(
            messages=messages,
            model=model,
            temperature=0.0,
        )
        # The message content may be None; treat that as an empty (unparseable) reply.
        response = chat_completion.choices[0].message.content or ""
        messages.append({"role": "assistant", "content": response})

        # Try to parse JSON with error handling and retry logic
        retry_count = 0
        _new_translations = None
        while retry_count < max_retries and _new_translations is None:
            try:
                parsed = json.loads(_strip_code_fence(response))
                if not isinstance(parsed, dict):
                    raise ValueError(
                        f"expected a JSON object, got {type(parsed).__name__}"
                    )
                _new_translations = parsed
            except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
                retry_count += 1
                print(f"JSON parsing error (attempt {retry_count}/{max_retries}): {e}")
                print(f"Problematic response: {response[:200]}...")

                if retry_count < max_retries:
                    print("Asking LLM to fix the JSON error...")

                    # Append the error to the conversation and ask LLM to resolve it
                    error_message = f"The previous response caused a JSON parsing error: {str(e)}. The response was: {response}\n\nPlease provide a corrected response that is valid JSON format only, without any markdown formatting or additional text."
                    messages.append({"role": "user", "content": error_message})

                    chat_completion = client.chat.completions.create(
                        messages=messages,
                        model=model,
                        temperature=0.0,
                        max_tokens=8000,
                    )
                    response = chat_completion.choices[0].message.content or ""

                    # Add the new response to the conversation
                    messages.append({"role": "assistant", "content": response})

        if _new_translations is None:
            print(
                f"Failed to parse JSON after {max_retries} attempts. Skipping chunk {start // chunk_size + 1}"
            )
            continue

        # Keep only the keys that were actually requested and whose value is a
        # string, so unexpected model output never reaches the translation file.
        accepted = {
            key: value
            for key, value in _new_translations.items()
            if key in translation_requests and isinstance(value, str)
        }
        ignored = len(_new_translations) - len(accepted)
        if ignored:
            print(f"Ignored {ignored} unexpected entries in the model response.")

        new_translations.update(accepted)
        print(f"Translated {progress}/{total}")

    # save translations
    current_translations = load_translations(lang_code)
    current_translations.update(new_translations)
    save_translations(lang_code, current_translations)

    # reconcile metadata
    reconcile_metadata(lang_code, needed_translations, english_translations_dict)


def load_supported_languages() -> list[tuple[str, str]]:
    """
    Read `languages.json` (resolved against the current working directory) and
    return its entries as (language_code, language_name) pairs, in file order.

    Raises AssertionError when an entry is not a dictionary or lacks a non-empty
    'language_code' or 'language_name'.
    """
    with open("languages.json", "r", encoding="utf-8") as languages_file:
        entries = json.load(languages_file)

    def to_pair(entry: Any) -> tuple[str, str]:
        # Validate a single entry and convert it to a (code, name) tuple.
        assert isinstance(entry, dict), "Each language entry must be a dictionary."
        code = entry.get("language_code", None)
        name = entry.get("language_name", None)
        assert (
            code and name
        ), f"Each language must have a 'language_code' and 'language_name'. Found: {entry}"
        return (code, name)

    return [to_pair(entry) for entry in entries]


"""
python scripts/translate.py --lang vi --lang-display-name "Vietnamese" --mode append

python scripts/translate.py --translate-all --mode append
"""
# Script entry point: parse the command line, validate it, and run the translation
# for a single language (--lang / --lang-display-name) or for every language
# returned by load_supported_languages() (--translate-all).
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Translate app strings.")

    parser.add_argument(
        "--lang",
        type=str,
        help="Language code to translate to (e.g. 'vi' for Vietnamese, 'en' for English).",
    )

    parser.add_argument(
        "--lang-display-name",
        type=str,
        help="Display name of the language (e.g. 'Vietnamese', 'English').",
    )

    parser.add_argument(
        "--mode",
        type=str,
        choices=["append", "upsert", "update"],
        default="append",
        help="Mode of translation: 'append' to translate only missing keys, 'upsert' to translate all keys, 'update' to specify keys to translate and update their metadata.",
    )

    parser.add_argument(
        "--translate-all",
        action="store_true",
        help="Translate into every supported language instead of a single one (--lang and --lang-display-name are then not required).",
    )

    args = parser.parse_args()

    translate_all = args.translate_all
    lang_code = args.lang
    lang_display_name = args.lang_display_name
    mode = args.mode

    # Use parser.error rather than assert: it still runs under `python -O` and
    # reports a usage message (exit status 2) instead of a traceback.
    if not translate_all:
        if not lang_code:
            parser.error(
                "Language code is required if translate all is not set. Use --lang to specify the language code."
            )
        if not lang_display_name:
            parser.error(
                "Language display name is required if translate all is not set. Use --lang-display-name to specify the language display name."
            )

    if mode != "append":
        raise NotImplementedError(
            f"Mode '{mode}' is not implemented yet. Please use 'append' mode for now."
        )

    if translate_all:
        languages = load_supported_languages()
        for position, (lang_code, lang_display_name) in enumerate(languages, start=1):
            print(f"Translating {position}/{len(languages)}: {lang_display_name}")
            append_translate(
                lang_code=lang_code,
                lang_display_name=lang_display_name,
            )
    else:
        append_translate(
            lang_code=lang_code,
            lang_display_name=lang_display_name,
        )