fluffychat/scripts/translate/translate_keys.py
2025-11-05 10:38:16 -05:00

496 lines
17 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Translate a specific set of keys in the ARB files.
This script accepts a list of keys and:
1. Finds the value in the source ARB file (intl_en.arb)
2. Finds each instance of the value in the target ARB files (intl_*.arb)
3. Translates the value using OpenAI (reusing translate.py model)
4. Replaces the value in the target ARB files with the translated value
Usage:
python translate_keys.py --keys key1 key2 key3
python translate_keys.py --keys-file keys.txt
WARNING: Has not been tested extensively. Has not been tested with pluralization or
complex placeholders. Verify results before committing.
"""
import argparse
import sys
from pathlib import Path
from typing import Any, List
try:
from openai import OpenAI
except ImportError:
print("Error: openai package not found. Please install it with: pip install openai")
sys.exit(1)
# We'll implement all necessary functions locally to avoid path issues
# Get the client root directory (where this script is run from when called as module)
# If the current working directory is already "client", use it; otherwise walk
# three levels up from this file (scripts/translate/translate_keys.py -> client).
client_root = (
    Path.cwd() if Path.cwd().name == "client" else Path(__file__).parent.parent.parent
)
# Directory containing the ARB localization files (lib/l10n/intl_*.arb).
l10n_dir = client_root / "lib" / "l10n"
def load_translations(lang_code: str) -> dict[str, str]:
    """Return the translation mapping stored in intl_<lang_code>.arb.

    A missing file yields an empty dict rather than raising.
    """
    import json

    arb_path = l10n_dir / f"intl_{lang_code}.arb"
    if arb_path.exists():
        return json.loads(arb_path.read_text(encoding="utf-8"))
    return {}
def load_supported_languages() -> List[tuple[str, str]]:
    """Load the supported languages from the languages.json file with correct path.

    Returns:
        A list of (language_code, language_name) tuples, in file order.

    Raises:
        ValueError: if an entry is not a dict or lacks the required fields.
        FileNotFoundError: if scripts/languages.json does not exist.
    """
    import json

    languages_path = client_root / "scripts" / "languages.json"
    with open(languages_path, "r", encoding="utf-8") as f:
        raw_languages = json.load(f)

    languages: List[tuple[str, str]] = []
    for lang in raw_languages:
        # Explicit raises instead of `assert`: asserts are stripped when
        # Python runs with -O, which would silently skip this validation.
        if not isinstance(lang, dict):
            raise ValueError("Each language entry must be a dictionary.")
        language_code = lang.get("language_code", None)
        language_name = lang.get("language_name", None)
        if not (language_code and language_name):
            raise ValueError(
                "Each language must have a 'language_code' and "
                f"'language_name'. Found: {lang}"
            )
        languages.append((language_code, language_name))
    return languages
def save_translations(lang_code: str, translations: dict[str, str]) -> None:
    """Save translations for a language code using the correct path.

    Refreshes the @@locale / @@last_modified bookkeeping keys, merges the given
    translations into whatever the file already contains (existing keys keep
    their position, new keys are appended at the end), and rewrites the file.

    Note: mutates the caller's `translations` dict (adds the @@ keys).
    """
    import json
    from collections import OrderedDict
    from datetime import datetime

    path_to_translations = l10n_dir / f"intl_{lang_code}.arb"
    translations["@@locale"] = lang_code
    translations["@@last_modified"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    # Load existing data to preserve order if exists.
    existing_data = OrderedDict()
    if path_to_translations.exists():
        with open(path_to_translations, "r", encoding="utf-8") as f:
            try:
                existing_data = json.load(f, object_pairs_hook=OrderedDict)
            except json.JSONDecodeError:
                # Corrupt/empty file: start fresh rather than crash.
                existing_data = OrderedDict()

    # dict.update already does what the original if/else did (both branches
    # were identical): existing keys are overwritten in place with their
    # order unchanged, new keys are appended at the end.
    existing_data.update(translations)

    with open(path_to_translations, "w", encoding="utf-8") as f:
        f.write(json.dumps(existing_data, indent=2, ensure_ascii=False))
def _computed_placeholders(
    placeholder_names: List[str], english_meta: dict[str, Any]
) -> dict[str, dict]:
    """Build placeholder metadata, copying each placeholder's 'type' from the
    English metadata when one is declared there; otherwise leave it empty."""
    computed: dict[str, dict] = {}
    english_placeholders = english_meta.get("placeholders", {})
    for name in placeholder_names:
        computed[name] = {}
        placeholder_type = english_placeholders.get(name, {}).get("type")
        if placeholder_type:
            computed[name]["type"] = placeholder_type
    return computed


def _merged_meta(
    existing_meta: dict[str, Any],
    english_meta: dict[str, Any],
    computed_placeholders: dict[str, dict],
) -> dict[str, Any]:
    """Merge computed placeholders into existing metadata, or build fresh
    metadata whose 'type' is taken from the English entry (default String)."""
    if existing_meta:
        existing_meta.setdefault("type", "String")
        existing_meta["placeholders"] = computed_placeholders
        return existing_meta
    return {
        "type": english_meta.get("type", "String"),
        "placeholders": computed_placeholders,
    }


def reconcile_metadata(
    lang_code: str,
    translation_keys: List[str],
    english_translations_dict: dict[str, Any],
) -> None:
    """
    For each translation key, update its metadata (the key prefixed with '@') by merging
    any existing metadata with computed metadata. For basic translations, if no metadata exists,
    add it; otherwise, leave it as is.

    Bug fix vs. the previous version: for pluralized translations the 'type'
    looked up from the English metadata was computed but then discarded in
    favor of a hardcoded "String"; it is now actually used.
    """
    translations = load_translations(lang_code)
    for key in translation_keys:
        # Skip keys that weren't successfully translated.
        if key not in translations:
            continue
        translation = translations[key]
        meta_key = f"@{key}"
        existing_meta = translations.get(meta_key, {})
        english_meta = english_translations_dict.get(meta_key, {})
        assert isinstance(translation, str)

        has_plural = "plural," in translation
        has_other = "other{" in translation

        if "{" not in translation:
            # Case 1: basic translation, no placeholders. Only add metadata
            # when none exists yet; otherwise leave it untouched.
            if not existing_meta:
                translations[meta_key] = {"type": "String", "placeholders": {}}
        elif not has_plural and not has_other:
            # Case 2: placeholders without pluralization. Every "{name}"
            # occurrence becomes a placeholder entry.
            names = [chunk.split("}")[0] for chunk in translation.split("{")[1:]]
            translations[meta_key] = _merged_meta(
                existing_meta,
                english_meta,
                _computed_placeholders(names, english_meta),
            )
        elif has_plural and has_other:
            # Case 3: ICU pluralization. Only placeholder names appearing
            # before the "plural," keyword (e.g. "{count, plural, ...}")
            # are extracted.
            prefix = translation.split("plural,")[0].split("{")[1]
            names = [p.strip() for p in prefix.split(",") if p.strip() != ""]
            translations[meta_key] = _merged_meta(
                existing_meta,
                english_meta,
                _computed_placeholders(names, english_meta),
            )
        # Translations containing only one of the two plural markers fall
        # through untouched, matching the original branch conditions.
    save_translations(lang_code, translations)
def get_all_target_language_files() -> List[Path]:
    """Return every target ARB file: all intl_*.arb except the English source."""
    return [
        arb for arb in l10n_dir.glob("intl_*.arb") if arb.name != "intl_en.arb"
    ]
def extract_language_code_from_filename(arb_path: Path) -> str:
    """Derive the language code from an ARB filename (e.g., intl_es.arb -> es)."""
    # .stem drops the ".arb" suffix; stripping the "intl_" prefix leaves the code.
    return arb_path.stem.replace("intl_", "")
def translate_batch_with_openai(
    translation_requests: dict[str, str],
    target_language: str,
    source_language: str = "English",
) -> dict[str, str]:
    """
    Translate a batch of texts using OpenAI API.
    Args:
        translation_requests: Dictionary of {key: text} to translate
        target_language: Target language name (e.g., "Spanish", "French")
        source_language: Source language name (default: "English")
    Returns:
        Dictionary of {key: translated_text}. On any parsing, network, or API
        error the original texts are returned unchanged (best-effort fallback).
    """
    import json

    client = OpenAI()
    # Create a batch translation prompt.
    prompt = f"""
Translate the following JSON object from {source_language} to {target_language}.
Preserve any placeholders (text within curly braces like {{username}}) exactly as they are.
Preserve any special formatting or ICU message format syntax.
Return only a JSON object with the same keys but translated values.
JSON to translate: {json.dumps(translation_requests, indent=2)}
"""
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional translator. You translate text accurately while preserving any placeholders and special formatting. Always respond with valid JSON only.",
                },
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            model="gpt-4o-mini",
            temperature=0.0,
        )
        # content can be None (e.g. refusal/filtered response); guard it so we
        # hit the JSONDecodeError fallback instead of an AttributeError.
        response = (chat_completion.choices[0].message.content or "").strip()
        # Strip a Markdown code fence if the model added one, with or without
        # the "json" language tag.
        if response.startswith("```json"):
            response = response[7:]
        elif response.startswith("```"):
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]
        response = response.strip()
        # Parse the JSON response.
        translated_batch = json.loads(response)
        return translated_batch
    except json.JSONDecodeError as e:
        print(f"JSON parsing error when translating batch to {target_language}: {e}")
        print(f"Response was: {response[:200]}...")
        # Fallback to original texts if parsing fails
        return translation_requests
    except (ConnectionError, TimeoutError) as e:
        print(f"Network error translating batch to {target_language}: {e}")
        return translation_requests  # Return original texts if translation fails
    except ValueError as e:
        print(f"API error translating batch to {target_language}: {e}")
        return translation_requests  # Return original texts if translation fails
def get_language_display_name(lang_code: str) -> str:
    """Get the display name for a language code."""
    # Prefer the first matching entry declared in languages.json.
    declared = next(
        (
            display_name
            for code, display_name in load_supported_languages()
            if code == lang_code
        ),
        None,
    )
    if declared is not None:
        return declared
    # Fallback mapping for common language codes
    fallback_names = {
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian",
        "pt": "Portuguese",
        "pt_PT": "Portuguese",
        "ja": "Japanese",
        "ko": "Korean",
        "zh": "Chinese",
        "zh_Hant": "Traditional Chinese",
        "ru": "Russian",
        "ar": "Arabic",
        "hi": "Hindi",
        "nl": "Dutch",
        "sv": "Swedish",
        "da": "Danish",
        "no": "Norwegian",
        "nb": "Norwegian",
        "fi": "Finnish",
        "pl": "Polish",
        "cs": "Czech",
        "sk": "Slovak",
        "hu": "Hungarian",
        "ro": "Romanian",
        "bg": "Bulgarian",
        "hr": "Croatian",
        "sl": "Slovenian",
        "et": "Estonian",
        "lv": "Latvian",
        "lt": "Lithuanian",
        "el": "Greek",
        "tr": "Turkish",
        "th": "Thai",
        "vi": "Vietnamese",
        "id": "Indonesian",
        "ms": "Malay",
        "fil": "Filipino",
        "he": "Hebrew",
        "uk": "Ukrainian",
        "be": "Belarusian",
        "ca": "Catalan",
        "gl": "Galician",
        "eu": "Basque",
    }
    # Last resort: title-case the raw code itself.
    return fallback_names.get(lang_code, lang_code.title())
def translate_keys(keys_to_translate: List[str]) -> None:
    """
    Translate specific keys in all target ARB files using batch translation.

    For every non-English ARB file: the English value of each key is sent to
    the OpenAI batch translator, the translated value replaces the key's entry
    in that file, and the key's '@'-prefixed metadata is reconciled afterwards.

    Args:
        keys_to_translate: List of keys to translate
    """
    # Load English translations (source)
    english_translations = load_translations("en")
    # Validate that all keys exist in English
    missing_keys = [key for key in keys_to_translate if key not in english_translations]
    if missing_keys:
        print(
            f"Error: The following keys were not found in intl_en.arb: {missing_keys}"
        )
        return
    # Filter out metadata keys ('@'-prefixed); only plain values are translated.
    keys_to_translate = [key for key in keys_to_translate if not key.startswith("@")]
    # Get all target ARB files
    target_files = get_all_target_language_files()
    print(f"Found {len(target_files)} target language files")
    print(f"Translating {len(keys_to_translate)} keys: {keys_to_translate}")
    # Process keys in batches of 10 to keep each OpenAI request small.
    batch_size = 10
    for arb_file in target_files:
        lang_code = extract_language_code_from_filename(arb_file)
        lang_display_name = get_language_display_name(lang_code)
        print(f"\nProcessing {lang_display_name} ({lang_code})...")
        # Load current translations for this language
        current_translations = load_translations(lang_code)
        # Track which keys were actually updated
        updated_keys = []
        # Process keys in batches
        for i in range(0, len(keys_to_translate), batch_size):
            batch = keys_to_translate[i : i + batch_size]
            # Prepare translation requests for this batch
            translation_requests = {}
            for key in batch:
                english_text = english_translations[key]
                translation_requests[key] = english_text
            print(
                f" Translating batch {i//batch_size + 1} ({len(batch)} keys): {batch}"
            )
            # Translate the batch (falls back to English text on API failure).
            translated_batch = translate_batch_with_openai(
                translation_requests, lang_display_name
            )
            # Update translations and track updated keys
            for key in batch:
                if key in translated_batch:
                    translated_text = translated_batch[key]
                    current_translations[key] = translated_text
                    updated_keys.append(key)
                    print(
                        f" {key}: '{english_translations[key]}' -> '{translated_text}'"
                    )
                else:
                    print(f" Warning: Key '{key}' not found in translation response")
        # Save the updated translations, then fix up the '@key' metadata so
        # the ARB stays consistent with the new values.
        if updated_keys:
            save_translations(lang_code, current_translations)
            # Reconcile metadata for updated keys
            reconcile_metadata(lang_code, updated_keys, english_translations)
            print(f" ✓ Updated {len(updated_keys)} keys in {lang_code}")
        else:
            print(f" - No keys updated in {lang_code}")
def read_keys_from_file(file_path: Path) -> List[str]:
    """Read keys from a text file (one key per line).

    Blank lines and lines starting with '#' are skipped; surrounding
    whitespace is trimmed from each key.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Keys file not found: {file_path}")
    keys: List[str] = []
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if stripped and not stripped.startswith("#"):
                keys.append(stripped)
    return keys
def main():
    """Parse CLI arguments and translate the requested keys in all target ARB files."""
    parser = argparse.ArgumentParser(
        description="Translate specific keys in ARB files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Fixed examples: a `-m` module path must not end in `.py`, and the
        # keys-file argument is a filesystem path (slashes, not dots).
        epilog="""
Examples:
  pip install openai
  python translate_keys.py --keys about accept account
  python -m scripts.translate.translate_keys --keys-file scripts/translate/keys_to_translate.txt
""",
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--keys", nargs="+", help="List of keys to translate")
    group.add_argument(
        "--keys-file",
        type=Path,
        help="File containing keys to translate (one per line)",
    )
    args = parser.parse_args()

    # Get the keys to translate
    if args.keys:
        keys_to_translate = args.keys
    else:
        keys_to_translate = read_keys_from_file(args.keys_file)
    if not keys_to_translate:
        print("Error: No keys to translate")
        sys.exit(1)

    # Import before the try block: previously `import os` sat inside `try`,
    # so a failure during the import itself would have made the `finally`
    # clause raise NameError on `os.chdir`.
    import os

    # Change to the client directory so relative paths work
    original_cwd = Path.cwd()
    client_dir = Path(__file__).parent.parent.parent
    try:
        os.chdir(client_dir)
        translate_keys(keys_to_translate)
    finally:
        # Always restore the caller's working directory.
        os.chdir(original_cwd)
    print("\n✓ Translation complete!")


if __name__ == "__main__":
    main()