From fd65a26c8154d7119a25b06eaf135b314eb58fb2 Mon Sep 17 00:00:00 2001 From: mark Date: Sat, 31 Jan 2026 22:13:37 +0200 Subject: [PATCH 1/9] feat: add translation script and GitHub Action for mobile i18n autofill - Create `translate.py` to automate Android string resource translations using the Google Gemini API, featuring placeholder/markup preservation, change detection via snapshots, and batch processing. - Add `.github/workflows/mobile-i18n-autofill-pr.yml` to trigger translation autofill on pull requests to the development branch. - Support translation for 24 locales using the `gemma-3-27b-it` model. - Implement automated validation of translated Android resources and direct commits of updates to PR branches. --- .github/workflows/mobile-i18n-autofill-pr.yml | 89 + translate.py | 1673 +++++++++++++++++ 2 files changed, 1762 insertions(+) create mode 100644 .github/workflows/mobile-i18n-autofill-pr.yml create mode 100644 translate.py diff --git a/.github/workflows/mobile-i18n-autofill-pr.yml b/.github/workflows/mobile-i18n-autofill-pr.yml new file mode 100644 index 00000000..c16e55b8 --- /dev/null +++ b/.github/workflows/mobile-i18n-autofill-pr.yml @@ -0,0 +1,89 @@ +name: Mobile i18n Autofill (bot PR) + +on: + pull_request: + branches: [dev] + paths: + - 'cmp-android/**' + - 'feature/**' + - '.github/workflows/mobile-i18n-autofill-pr.yml' + - 'translate.py' + +permissions: + contents: write + pull-requests: write + +jobs: + i18n-autofill: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + # Checks out the actual branch of the PR + ref: ${{ github.event.pull_request.head.ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up JDK 21 + uses: actions/setup-java@v4 + with: + java-version: '21' + distribution: 'temurin' + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Cache Gradle + uses: actions/cache@v4 + with: + path: | + 
~/.gradle/caches + ~/.gradle/wrapper + key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }} + restore-keys: | + ${{ runner.os }}-gradle- + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install google-genai lxml + + - name: Run translation autofill + env: + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + run: | + python translate.py \ + --mode apply \ + --locales "ar,bn,de,en,es,fa,fr,gu,hi,hu,id,km,kn,ml,mr,ms,my,pl,pt,ru,si,sw,te,ur" \ + --model "gemma-3-27b-it" \ + --batch-size 15 \ + --request-delay 2 + + - name: Validate Android resources compile + run: | + ./gradlew :cmp-android:processDemoDebugResources + + - name: Check for changes + id: check_changes + run: | + if git status --porcelain cmp-android/ feature/ | grep -q .; then + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + - name: Commit and Push changes + if: steps.check_changes.outputs.has_changes == 'true' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + git add cmp-android/ feature/ + git commit -m "chore: auto-generate mobile i18n translations" + + # Pushes directly back to the PR head branch + git push origin HEAD:${{ github.event.pull_request.head.ref }} \ No newline at end of file diff --git a/translate.py b/translate.py new file mode 100644 index 00000000..f91fbb18 --- /dev/null +++ b/translate.py @@ -0,0 +1,1673 @@ +#!/usr/bin/env python3 +""" +Android String Resource Translator + +Production-ready translation of Android string resources using Google Gemini API. + +Features: +- Comment preservation (copies comments from source file exactly) +- Spacing preservation (maintains blank lines and structure from source) +- Placeholder preservation with validation (%s, %1$s, etc.) +- Markup tag preservation with validation (, , etc.) 
+- Token order validation (not just presence) +- Source attribute propagation (formatted, product, tools:*) +- Conditional placeholder handling for formatted="false" strings +- Whitespace preservation (no stripping of source text) +- HTML entity conversion (case-insensitive) +- Android special character escaping +- Proper xliff namespace handling for AAPT2 compatibility +- Batch translation with individual fallback +- Better 503/overload error handling +- Change detection via snapshot tracking (re-translates modified strings) +- Comprehensive validation and error handling + +Usage: + python translate.py --mode check --locales es,de,fr + python translate.py --mode apply --locales es,de,fr + python translate.py --mode apply --locales ar --model gemma-3-27b-it --batch-size 15 + +Environment: + GEMINI_API_KEY=your_api_key_here +""" + +from __future__ import annotations + +import argparse +import copy +import hashlib +import json +import logging +import os +import re +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, FrozenSet, List, Optional, Set, Tuple + +from lxml import etree as ET +from google import genai +from google.genai import types + +# ============================================================================ +# Logging Configuration +# ============================================================================ + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger(__name__) + +# ============================================================================ +# Constants & Namespaces +# ============================================================================ + +XML_PARSER = ET.XMLParser( + remove_blank_text=False, + remove_comments=False, + strip_cdata=False, +) + +XLIFF_NAMESPACE = "urn:oasis:names:tc:xliff:document:1.2" +TOOLS_NAMESPACE = "http://schemas.android.com/tools" + 
+ET.register_namespace("xliff", XLIFF_NAMESPACE) +ET.register_namespace("tools", TOOLS_NAMESPACE) + +DEFAULT_EXCLUDE_DIRS: FrozenSet[str] = frozenset({ + ".git", ".gradle", "build", ".idea", "node_modules", + "__pycache__", "venv", ".venv", ".svn", ".hg", "target", + "bin", "obj", ".dart_tool", ".pub-cache", +}) + +PROPAGATE_ATTRIBUTES: FrozenSet[str] = frozenset({ + "formatted", + "product", +}) + +ALLOWED_TAGS: FrozenSet[str] = frozenset({ + "b", "i", "u", "s", "strike", "del", "ins", + "strong", "em", "cite", "dfn", "code", "samp", "kbd", "var", + "big", "small", "sup", "sub", "tt", + "a", "font", "annotation", "span", + "xliff:g", "g", +}) + +SNAPSHOT_DIR_NAME = ".translation_snapshots" + +# ============================================================================ +# Regex Patterns +# ============================================================================ + +PLACEHOLDER_PATTERNS = [ + r"%%", + r"%n", + r"%(?:\d+\$)?[-+# 0,(]*\d*(?:\.\d+)?[sdbBhHoOxXeEfgGaAcC]", + r"%(?:\d+\$)?[-+# 0,(]*\d*(?:\.\d+)?t[HIklMSLNpzZsQBbhAaCYyjmdeRTrDFc]", +] +PLACEHOLDER_RE = re.compile("|".join(PLACEHOLDER_PATTERNS)) + +XLIFF_TAG_RE = re.compile( + r"]*>.*?|]*/\s*>", + re.DOTALL | re.IGNORECASE, +) + +MARKUP_TAG_RE = re.compile(r"]*>") +MARKUP_PATTERN = re.compile(r"]*>") +TAG_NAME_PATTERN = re.compile(r" bool: + return self.attributes.get("formatted", "true").lower() != "false" + + def get_propagated_attributes(self) -> Dict[str, str]: + """Get attributes to propagate to translation (name, formatted, product, tools:*).""" + result = {"name": self.key} + for attr in PROPAGATE_ATTRIBUTES: + if attr in self.attributes: + result[attr] = self.attributes[attr] + for key, value in self.attributes.items(): + if key.startswith(f"{{{TOOLS_NAMESPACE}}}") or key.startswith("tools:"): + result[key] = value + return result + + +@dataclass +class FrozenText: + """Text with placeholders and markup tags replaced by tokens.""" + original: str + frozen: str + placeholders: List[str] + tags: 
List[str] + + def unfreeze(self, translated_frozen: str) -> str: + result = translated_frozen + for i, ph in enumerate(self.placeholders): + result = result.replace(f"[[PH_{i}]]", ph) + for i, tag in enumerate(self.tags): + result = result.replace(f"[[TAG_{i}]]", tag) + return result + + def validate(self, translated_frozen: str) -> Tuple[bool, List[str]]: + errors: List[str] = [] + for i, ph in enumerate(self.placeholders): + token = f"[[PH_{i}]]" + if token not in translated_frozen: + errors.append(f"Missing placeholder {token} (was: {ph})") + for i, tag in enumerate(self.tags): + token = f"[[TAG_{i}]]" + if token not in translated_frozen: + tag_preview = tag[:40] + "..." if len(tag) > 40 else tag + errors.append(f"Missing tag {token} (was: {tag_preview})") + expected_tokens = TOKEN_SEQUENCE_RE.findall(self.frozen) + actual_tokens = TOKEN_SEQUENCE_RE.findall(translated_frozen) + if expected_tokens != actual_tokens and not errors: + errors.append(f"Token order changed: expected {expected_tokens}, got {actual_tokens}") + return len(errors) == 0, errors + + @property + def has_tokens(self) -> bool: + return bool(self.placeholders or self.tags) + + @property + def token_count(self) -> int: + return len(self.placeholders) + len(self.tags) + + +@dataclass +class LocaleResult: + """Translation results for a single locale and source file.""" + locale: str + source_path: Path + target_path: Path + total_source: int = 0 + already_translated: int = 0 + newly_translated: int = 0 + changed_count: int = 0 + failed: int = 0 + errors: List[str] = field(default_factory=list) + + @property + def missing_before(self) -> int: + return self.total_source - self.already_translated + self.changed_count + + +@dataclass +class ProcessingResult: + """Overall processing results across all files and locales.""" + locale_results: List[LocaleResult] = field(default_factory=list) + + @property + def total_missing_before(self) -> int: + return sum(r.missing_before for r in self.locale_results) + 
+ @property + def total_translated(self) -> int: + return sum(r.newly_translated for r in self.locale_results) + + @property + def total_changed(self) -> int: + return sum(r.changed_count for r in self.locale_results) + + @property + def total_failed(self) -> int: + return sum(r.failed for r in self.locale_results) + + @property + def has_missing(self) -> bool: + return (self.total_missing_before - self.total_translated) > 0 + + @property + def has_failures(self) -> bool: + return self.total_failed > 0 + + +# ============================================================================ +# Snapshot Tracking Functions (Minimal Hash-Only) +# ============================================================================ + + +def content_hash(text: str) -> str: + """Generate short hash of string content for change detection.""" + return hashlib.sha256(text.encode('utf-8')).hexdigest()[:12] + + +def get_snapshot_path(source_xml: Path, repo_root: Path) -> Path: + """Get snapshot file path for a source file.""" + try: + relative = source_xml.relative_to(repo_root) + safe_name = str(relative).replace("/", "_").replace("\\", "_").replace(":", "_") + except ValueError: + safe_name = source_xml.name + return repo_root / SNAPSHOT_DIR_NAME / f"{safe_name}.json" + + +def load_snapshot(snapshot_path: Path) -> Dict[str, str]: + """Load snapshot: key -> hash mapping.""" + if not snapshot_path.exists(): + return {} + try: + content = snapshot_path.read_text(encoding='utf-8') + data = json.loads(content) + if isinstance(data, dict): + return {str(k): str(v) for k, v in data.items()} + return {} + except (json.JSONDecodeError, IOError, OSError) as e: + logger.warning(f"Failed to load snapshot {snapshot_path}: {e}") + return {} + + +def save_snapshot(snapshot_path: Path, entries: List[StringEntry]) -> None: + """Save minimal snapshot: key -> hash only.""" + try: + snapshot_path.parent.mkdir(parents=True, exist_ok=True) + + data = {entry.key: content_hash(entry.text) for entry in entries} + 
+ # Compact JSON format to minimize size + snapshot_path.write_text( + json.dumps(data, sort_keys=True, separators=(',', ':')), + encoding='utf-8' + ) + logger.debug(f"Saved snapshot: {snapshot_path}") + except (IOError, OSError) as e: + logger.warning(f"Failed to save snapshot {snapshot_path}: {e}") + + +def find_changed_entries( + source_entries: List[StringEntry], + snapshot: Dict[str, str], + existing_keys: Set[str], +) -> List[StringEntry]: + """ + Find entries where source text changed since last translation. + + Only returns entries that: + 1. Exist in the snapshot (were previously processed) + 2. Have different content hash now + 3. Already exist in target file (need re-translation, not new) + """ + changed: List[StringEntry] = [] + + for entry in source_entries: + # Skip if not in snapshot (new key, handled separately) + if entry.key not in snapshot: + continue + + # Skip if not already translated + if entry.key not in existing_keys: + continue + + stored_hash = snapshot[entry.key] + current_hash = content_hash(entry.text) + + if stored_hash != current_hash: + changed.append(entry) + logger.debug( + f" Detected change in '{entry.key}': " + f"hash {stored_hash[:8]}... → {current_hash[:8]}..." 
+ ) + + return changed + + +def _snapshot_needs_update(snapshot: Dict[str, str], source_entries: List[StringEntry]) -> bool: + """Check if snapshot needs to be updated based on source changes.""" + # No snapshot exists + if not snapshot: + return True + + current_keys: Set[str] = set() + for entry in source_entries: + current_keys.add(entry.key) + current_hash = content_hash(entry.text) + stored_hash = snapshot.get(entry.key) + + # New key or changed content + if stored_hash != current_hash: + return True + + # Check for removed keys + snapshot_keys = set(snapshot.keys()) + if snapshot_keys - current_keys: + return True + + return False + + +# ============================================================================ +# Text Freezing Functions +# ============================================================================ + + +def freeze_text(text: str, freeze_placeholders: bool = True) -> FrozenText: + """Replace placeholders and markup tags with tokens for safe translation.""" + frozen = text + placeholders: List[str] = [] + tags: List[str] = [] + + def freeze_xliff(match: re.Match) -> str: + tags.append(match.group(0)) + return f"[[TAG_{len(tags) - 1}]]" + + frozen = XLIFF_TAG_RE.sub(freeze_xliff, frozen) + + def freeze_tag(match: re.Match) -> str: + tags.append(match.group(0)) + return f"[[TAG_{len(tags) - 1}]]" + + frozen = MARKUP_TAG_RE.sub(freeze_tag, frozen) + + if freeze_placeholders: + def freeze_ph(match: re.Match) -> str: + placeholders.append(match.group(0)) + return f"[[PH_{len(placeholders) - 1}]]" + frozen = PLACEHOLDER_RE.sub(freeze_ph, frozen) + + return FrozenText(original=text, frozen=frozen, placeholders=placeholders, tags=tags) + + +# ============================================================================ +# Text Sanitization Functions +# ============================================================================ + + +def convert_html_entities_to_numeric(text: str) -> str: + """Convert HTML named entities to XML numeric entities.""" + 
def replace_entity(match: re.Match) -> str: + name = match.group(1).lower() + return HTML_ENTITY_TO_NUMERIC.get(name, match.group(0)) + return HTML_ENTITY_PATTERN.sub(replace_entity, text) + + +def fix_bare_ampersands(text: str) -> str: + """Replace bare ampersands with & for XML validity.""" + return BARE_AMPERSAND_PATTERN.sub("&", text) + + +def sanitize_for_xml_parse(text: str) -> str: + """Prepare text for XML parsing.""" + result = convert_html_entities_to_numeric(text) + return fix_bare_ampersands(result) + + +def escape_android_string(text: str) -> str: + """Escape Android special characters in string resources.""" + if not text: + return text + result: List[str] = [] + i = 0 + length = len(text) + while i < length: + char = text[i] + if char == '\\' and i + 1 < length: + next_char = text[i + 1] + if next_char in ("'", '"', '\\', 'n', 't', 'r', '@', '?'): + result.append(char) + result.append(next_char) + i += 2 + continue + if next_char == 'u' and i + 5 <= length: + hex_chars = text[i + 2:i + 6] + if len(hex_chars) == 4 and all(c in '0123456789abcdefABCDEF' for c in hex_chars): + result.append(text[i:i + 6]) + i += 6 + continue + if char == "'": + result.append("\\'") + elif char == '@' and i == 0: + result.append('\\@') + elif char == '?' 
and i == 0: + result.append('\\?') + else: + result.append(char) + i += 1 + return ''.join(result) + + +def escape_android_text_nodes(element: ET._Element) -> None: + """Recursively escape Android special characters in text and tail content.""" + if element.text: + element.text = escape_android_string(element.text) + for child in element: + if not callable(child.tag): + escape_android_text_nodes(child) + if child.tail: + child.tail = escape_android_string(child.tail) + + +def validate_allowed_tags(value: str) -> Tuple[bool, List[str]]: + """Check if all markup tags in value are in the allowlist.""" + if not MARKUP_PATTERN.search(value): + return True, [] + found = set(TAG_NAME_PATTERN.findall(value)) + unknown = [t for t in found if t.lower() not in ALLOWED_TAGS] + return len(unknown) == 0, unknown + + +# ============================================================================ +# XML Helper Functions +# ============================================================================ + + +def is_comment(elem) -> bool: + """Check if element is a comment (lxml comments have callable tag).""" + return callable(elem.tag) + + +def get_comment_text(elem) -> str: + """Get the text content of a comment element.""" + if is_comment(elem): + return elem.text or "" + return "" + + +def get_element_full_text(elem: ET._Element) -> str: + """Get full text content including child elements as markup.""" + parts: List[str] = [] + if elem.text: + parts.append(elem.text) + for child in elem: + if not is_comment(child): + parts.append(ET.tostring(child, encoding="unicode")) + if child.tail: + parts.append(child.tail) + return "".join(parts) + + +# ============================================================================ +# XML Reading Functions +# ============================================================================ + + +def read_source_strings(source_xml: Path) -> List[StringEntry]: + """Read translatable strings from source XML, preserving attributes.""" + tree = 
ET.parse(str(source_xml), parser=XML_PARSER) + root = tree.getroot() + entries: List[StringEntry] = [] + + for node in root.iter("string"): + name = node.get("name") + if not name: + continue + if node.get("translatable", "true").lower() == "false": + continue + raw_text = get_element_full_text(node) + if not raw_text or not raw_text.strip(): + continue + preserved: Dict[str, str] = {} + for attr_key, attr_val in node.attrib.items(): + if attr_key in ("name", "translatable"): + continue + if attr_key in PROPAGATE_ATTRIBUTES: + preserved[attr_key] = attr_val + elif attr_key.startswith(f"{{{TOOLS_NAMESPACE}}}"): + preserved[attr_key] = attr_val + entries.append(StringEntry(key=name, text=raw_text, attributes=preserved)) + + return entries + + +def read_existing_keys(target_xml: Path) -> Set[str]: + """Read existing string keys from target file.""" + if not target_xml.exists(): + return set() + try: + tree = ET.parse(str(target_xml), parser=XML_PARSER) + root = tree.getroot() + return set(root.xpath("./string/@name")) + except ET.XMLSyntaxError: + return set() + + +# ============================================================================ +# XML Writing Functions +# ============================================================================ + + +def set_mixed_string_value( + node: ET._Element, + value: str, + key: Optional[str] = None, + warn_unknown_tags: bool = True, +) -> None: + """Set string node value, preserving embedded markup.""" + node.text = None + for child in list(node): + node.remove(child) + + key_prefix = f"[{key}] " if key else "" + + if warn_unknown_tags and MARKUP_PATTERN.search(value): + is_valid, unknown = validate_allowed_tags(value) + if not is_valid: + logger.warning(f"{key_prefix}Unknown tags (may not render): {unknown}") + + if not MARKUP_PATTERN.search(value): + converted = convert_html_entities_to_numeric(value) + node.text = escape_android_string(converted) + return + + sanitized = sanitize_for_xml_parse(value) + wrapped = f"<_root 
xmlns:xliff='{XLIFF_NAMESPACE}'>{sanitized}" + + try: + fragment = ET.fromstring(wrapped.encode('utf-8')) + except ET.XMLSyntaxError as e: + logger.warning(f"{key_prefix}XML parse failed, using plain text: {e}") + fallback = convert_html_entities_to_numeric(value) + node.text = escape_android_string(fallback) + return + + node.text = fragment.text + for child in list(fragment): + fragment.remove(child) + node.append(child) + + escape_android_text_nodes(node) + + +def write_translations( + target_xml: Path, + translations: Dict[str, str], + source_entries: List[StringEntry], + source_xml: Path, + validate: bool = True, + warn_unknown_tags: bool = True, +) -> int: + """ + Write translations to target XML, preserving EXACT source structure. + + For NEW files: Deep copies source, replaces content, removes untranslated strings + For EXISTING files: Merges new translations while preserving structure + """ + target_xml.parent.mkdir(parents=True, exist_ok=True) + + # Parse source with all whitespace and comments preserved + source_tree = ET.parse(str(source_xml), parser=XML_PARSER) + source_root = source_tree.getroot() + + # Check if target already exists + if target_xml.exists(): + try: + existing_tree = ET.parse(str(target_xml), parser=XML_PARSER) + existing_root = existing_tree.getroot() + existing_keys = set(existing_root.xpath("./string/@name")) + + return _merge_into_existing( + target_xml, existing_root, translations, source_entries, + source_root, existing_keys, validate, warn_unknown_tags + ) + except ET.XMLSyntaxError as e: + logger.warning(f"Corrupted '{target_xml}', recreating: {e}") + + # Create new file from source structure + return _create_from_source( + target_xml, translations, source_entries, + source_root, validate, warn_unknown_tags + ) + + +def _create_from_source( + target_xml: Path, + translations: Dict[str, str], + source_entries: List[StringEntry], + source_root: ET._Element, + validate: bool, + warn_unknown_tags: bool, +) -> int: + """ + Create 
new translation file by deep copying source and replacing text. + Preserves all comments, whitespace, and exact ordering. + """ + # Deep copy preserves everything + root = copy.deepcopy(source_root) + + # Build set of keys that have translations + translated_keys: Set[str] = set(translations.keys()) + + # Build sections: list of (comment_elements, string_elements) + # Each section starts with zero or more comments followed by strings + sections: List[Tuple[List[ET._Element], List[ET._Element]]] = [] + current_comments: List[ET._Element] = [] + current_strings: List[ET._Element] = [] + + for elem in list(root): + if is_comment(elem): + if current_strings: + # Save previous section and start new one + sections.append((current_comments, current_strings)) + current_comments = [] + current_strings = [] + current_comments.append(elem) + elif elem.tag == "string": + current_strings.append(elem) + + if current_comments or current_strings: + sections.append((current_comments, current_strings)) + + written = 0 + elements_to_remove: List[ET._Element] = [] + + # Process each section + for comments, strings in sections: + # Check if this section has any translated strings + section_has_translation = False + for string_elem in strings: + name = string_elem.get("name") + if name and name in translated_keys: + section_has_translation = True + break + # Non-translatable strings don't count + if string_elem.get("translatable", "true").lower() == "false": + section_has_translation = True # Keep non-translatable + break + + if not section_has_translation: + # Remove entire section (comments + strings) + elements_to_remove.extend(comments) + elements_to_remove.extend(strings) + continue + + # Process strings in this section + for string_elem in strings: + name = string_elem.get("name") + + if not name: + elements_to_remove.append(string_elem) + continue + + # Keep non-translatable strings unchanged + if string_elem.get("translatable", "true").lower() == "false": + continue + + if name 
in translations: + # Update with translation + value = translations[name] + + # Clear content + string_elem.text = None + for child in list(string_elem): + string_elem.remove(child) + + set_mixed_string_value(string_elem, value, key=name, warn_unknown_tags=warn_unknown_tags) + written += 1 + else: + # No translation - remove this string + elements_to_remove.append(string_elem) + + # Remove elements while preserving whitespace + for elem in elements_to_remove: + _remove_element_preserve_whitespace(root, elem) + + # Clean up redundant namespace declarations + ET.cleanup_namespaces(root) + + # Write file + tree = ET.ElementTree(root) + tree.write( + str(target_xml), + encoding="utf-8", + xml_declaration=True, + pretty_print=False, + ) + + # Post-process to fix any xliff namespace prefix issues (ns0, ns1 -> xliff) + _fix_xliff_namespaces_in_file(target_xml) + + if validate: + try: + ET.parse(str(target_xml), parser=XML_PARSER) + except ET.XMLSyntaxError as e: + raise XmlWriteError(f"Written file is malformed: {target_xml}: {e}") + + return written + + +def _fix_xliff_namespaces_in_file(target_xml: Path) -> None: + """ + Post-process the written XML file to fix xliff namespace issues and formatting. + + lxml may generate auto-prefixed namespaces (ns0, ns1, etc.) instead of + using the proper 'xliff' prefix. 
This function: + - Fixes XML declaration to use double quotes and lowercase encoding + - Adds copyright header if missing + - Replaces ns#: prefixes with xliff: for XLIFF namespace + - Removes inline xmlns:ns# declarations for XLIFF + - Ensures xliff namespace is declared at root level + """ + content = target_xml.read_text(encoding='utf-8') + original_content = content + + # Fix XML declaration: single quotes to double quotes, uppercase to lowercase + content = re.sub( + r"<\?xml version='1\.0' encoding='UTF-8'\?>", + '', + content + ) + + # Copyright header template + copyright_header = '''''' + + # Add copyright header if missing (check for "Copyright" in a comment) + if '''' # Add copyright header if missing (check for "Copyright" in a comment) From e6093fc23184aec6e87252c1ba7877586f63ccec Mon Sep 17 00:00:00 2001 From: Mark Rizkalla <46606022+markrizkalla@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:13:09 +0200 Subject: [PATCH 3/9] Replace GITHUB_TOKEN with PAT_TOKEN in workflow --- .github/workflows/mobile-i18n-autofill-pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mobile-i18n-autofill-pr.yml b/.github/workflows/mobile-i18n-autofill-pr.yml index c16e55b8..8d9018c4 100644 --- a/.github/workflows/mobile-i18n-autofill-pr.yml +++ b/.github/workflows/mobile-i18n-autofill-pr.yml @@ -24,7 +24,7 @@ jobs: fetch-depth: 0 # Checks out the actual branch of the PR ref: ${{ github.event.pull_request.head.ref }} - token: ${{ secrets.GITHUB_TOKEN }} + token: ${{ secrets.PAT_TOKEN }} - name: Set up JDK 21 uses: actions/setup-java@v4 @@ -86,4 +86,4 @@ jobs: git commit -m "chore: auto-generate mobile i18n translations" # Pushes directly back to the PR head branch - git push origin HEAD:${{ github.event.pull_request.head.ref }} \ No newline at end of file + git push origin HEAD:${{ github.event.pull_request.head.ref }} From e6e8a0000968b415ccaf4f4f4b0cd59304221c31 Mon Sep 17 00:00:00 2001 From: Mark Rizkalla 
<46606022+markrizkalla@users.noreply.github.com> Date: Tue, 3 Feb 2026 13:23:00 +0200 Subject: [PATCH 4/9] Modify mobile i18n autofill workflow Updated workflow to use pull_request_target for better access control and added checks for changes before committing. --- .github/workflows/mobile-i18n-autofill-pr.yml | 46 +++++++------------ 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/.github/workflows/mobile-i18n-autofill-pr.yml b/.github/workflows/mobile-i18n-autofill-pr.yml index 8d9018c4..069f2cf8 100644 --- a/.github/workflows/mobile-i18n-autofill-pr.yml +++ b/.github/workflows/mobile-i18n-autofill-pr.yml @@ -1,7 +1,8 @@ name: Mobile i18n Autofill (bot PR) on: - pull_request: + # Changed from pull_request to pull_request_target for write access + pull_request_target: branches: [dev] paths: - 'cmp-android/**' @@ -16,15 +17,15 @@ permissions: jobs: i18n-autofill: runs-on: ubuntu-latest - + # Added an environment for an extra layer of protection if needed steps: - - name: Checkout repository + - name: Checkout Forked PR uses: actions/checkout@v4 with: - fetch-depth: 0 - # Checks out the actual branch of the PR + token: ${{ secrets.GITHUB_TOKEN }} + repository: ${{ github.event.pull_request.head.repo.full_name }} ref: ${{ github.event.pull_request.head.ref }} - token: ${{ secrets.PAT_TOKEN }} + fetch-depth: 0 - name: Set up JDK 21 uses: actions/setup-java@v4 @@ -37,16 +38,6 @@ jobs: with: python-version: '3.11' - - name: Cache Gradle - uses: actions/cache@v4 - with: - path: | - ~/.gradle/caches - ~/.gradle/wrapper - key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }} - restore-keys: | - ${{ runner.os }}-gradle- - - name: Install Python dependencies run: | python -m pip install --upgrade pip @@ -56,6 +47,7 @@ jobs: env: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} run: | + # The script is now running on the code from the fork python translate.py \ --mode apply \ --locales 
"ar,bn,de,en,es,fa,fr,gu,hi,hu,id,km,kn,ml,mr,ms,my,pl,pt,ru,si,sw,te,ur" \ @@ -67,23 +59,19 @@ jobs: run: | ./gradlew :cmp-android:processDemoDebugResources - - name: Check for changes - id: check_changes - run: | - if git status --porcelain cmp-android/ feature/ | grep -q .; then - echo "has_changes=true" >> $GITHUB_OUTPUT - else - echo "has_changes=false" >> $GITHUB_OUTPUT - fi - - name: Commit and Push changes - if: steps.check_changes.outputs.has_changes == 'true' run: | git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add cmp-android/ feature/ - git commit -m "chore: auto-generate mobile i18n translations" - # Pushes directly back to the PR head branch - git push origin HEAD:${{ github.event.pull_request.head.ref }} + # Only commit and push if there are actual changes + if ! git diff --cached --quiet; then + git commit -m "chore: auto-generate mobile i18n translations" + # Since checkout was done with GITHUB_TOKEN, origin is authenticated + git push origin HEAD:${{ github.event.pull_request.head.ref }} + else + echo "No translation changes detected." 
+ fi + From 2c484d76ff96179cbacb14e78b36d49dc681bb14 Mon Sep 17 00:00:00 2001 From: Mark Rizkalla <46606022+markrizkalla@users.noreply.github.com> Date: Tue, 3 Feb 2026 13:31:39 +0200 Subject: [PATCH 5/9] Change PR trigger to pull_request --- .github/workflows/mobile-i18n-autofill-pr.yml | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/.github/workflows/mobile-i18n-autofill-pr.yml b/.github/workflows/mobile-i18n-autofill-pr.yml index 069f2cf8..9739c480 100644 --- a/.github/workflows/mobile-i18n-autofill-pr.yml +++ b/.github/workflows/mobile-i18n-autofill-pr.yml @@ -1,8 +1,7 @@ name: Mobile i18n Autofill (bot PR) on: - # Changed from pull_request to pull_request_target for write access - pull_request_target: + pull_request: branches: [dev] paths: - 'cmp-android/**' @@ -17,15 +16,14 @@ permissions: jobs: i18n-autofill: runs-on: ubuntu-latest - # Added an environment for an extra layer of protection if needed + steps: - - name: Checkout Forked PR + - name: Checkout repository uses: actions/checkout@v4 with: + fetch-depth: 0 token: ${{ secrets.GITHUB_TOKEN }} - repository: ${{ github.event.pull_request.head.repo.full_name }} ref: ${{ github.event.pull_request.head.ref }} - fetch-depth: 0 - name: Set up JDK 21 uses: actions/setup-java@v4 @@ -47,13 +45,11 @@ jobs: env: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} run: | - # The script is now running on the code from the fork python translate.py \ --mode apply \ --locales "ar,bn,de,en,es,fa,fr,gu,hi,hu,id,km,kn,ml,mr,ms,my,pl,pt,ru,si,sw,te,ur" \ --model "gemma-3-27b-it" \ - --batch-size 15 \ - --request-delay 2 + --batch-size 15 - name: Validate Android resources compile run: | @@ -66,12 +62,11 @@ jobs: git add cmp-android/ feature/ - # Only commit and push if there are actual changes if ! 
git diff --cached --quiet; then git commit -m "chore: auto-generate mobile i18n translations" - # Since checkout was done with GITHUB_TOKEN, origin is authenticated + # This push will work ONLY if the PR is from the same repo git push origin HEAD:${{ github.event.pull_request.head.ref }} else - echo "No translation changes detected." + echo "No changes to commit." fi From eab5c7cb23f908d01dbb393a2964449535635326 Mon Sep 17 00:00:00 2001 From: Mark Rizkalla <46606022+markrizkalla@users.noreply.github.com> Date: Tue, 3 Feb 2026 13:45:37 +0200 Subject: [PATCH 6/9] Update GitHub Actions workflow for mobile i18n --- .github/workflows/mobile-i18n-autofill-pr.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mobile-i18n-autofill-pr.yml b/.github/workflows/mobile-i18n-autofill-pr.yml index 9739c480..aae20b3b 100644 --- a/.github/workflows/mobile-i18n-autofill-pr.yml +++ b/.github/workflows/mobile-i18n-autofill-pr.yml @@ -22,8 +22,9 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 - token: ${{ secrets.GITHUB_TOKEN }} + repository: ${{ github.event.pull_request.head.repo.full_name }} ref: ${{ github.event.pull_request.head.ref }} + token: ${{ secrets.GITHUB_TOKEN }} - name: Set up JDK 21 uses: actions/setup-java@v4 @@ -64,9 +65,8 @@ jobs: if ! git diff --cached --quiet; then git commit -m "chore: auto-generate mobile i18n translations" - # This push will work ONLY if the PR is from the same repo + # Since we checked out the fork, 'origin' is now the fork repo. git push origin HEAD:${{ github.event.pull_request.head.ref }} else echo "No changes to commit." fi - From 4dee8cc9ccca9ece9059330ecd377214330b39cd Mon Sep 17 00:00:00 2001 From: mark Date: Tue, 24 Feb 2026 03:58:04 +0200 Subject: [PATCH 7/9] feat: add support for string-array and plurals resources - Introduce `StringArrayEntry`, `PluralsEntry`, and `SourceResources` dataclasses to handle complex Android resource types. 
- Implement flattening logic to convert arrays and plurals into translatable `StringEntry` objects with unique key suffixes (`__item_N`, `__plural_QUANTITY`). - Update snapshot logic to hash and track changes for string-arrays and plurals. - Refactor XML parsing and writing to support merging and creating translations for all resource types while preserving source structure. - Expand file discovery to include `arrays.xml` alongside `strings.xml`. - Update translation prompts to provide guidance on handling plural forms and array item keys. --- translate.py | 916 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 622 insertions(+), 294 deletions(-) diff --git a/translate.py b/translate.py index 56a4675f..caa7422b 100644 --- a/translate.py +++ b/translate.py @@ -208,6 +208,103 @@ def get_propagated_attributes(self) -> Dict[str, str]: result[key] = value return result +@dataclass +class StringArrayEntry: + """A string-array resource entry with ordered items.""" + key: str + items: List[str] + attributes: Dict[str, str] = field(default_factory=dict) + + def flat_entries(self) -> List[StringEntry]: + """Flatten into individual StringEntry objects for translation.""" + return [ + StringEntry( + key=f"{self.key}__item_{i}", + text=text, + attributes=self.attributes, + ) + for i, text in enumerate(self.items) + if text and text.strip() + ] + + def content_for_hash(self) -> str: + """Combined content for snapshot hashing.""" + return "||".join(self.items) + + +@dataclass +class PluralsEntry: + """A plurals resource entry with quantity variants.""" + key: str + items: Dict[str, str] # quantity -> text + attributes: Dict[str, str] = field(default_factory=dict) + + def flat_entries(self) -> List[StringEntry]: + """Flatten into individual StringEntry objects for translation.""" + return [ + StringEntry( + key=f"{self.key}__plural_{quantity}", + text=text, + attributes=self.attributes, + ) + for quantity, text in self.items.items() + if text and text.strip() + ] 
+ + def content_for_hash(self) -> str: + """Combined content for snapshot hashing.""" + return "||".join(f"{q}={t}" for q, t in sorted(self.items.items())) + + +@dataclass +class SourceResources: + """All translatable resources from a single source file.""" + strings: List[StringEntry] = field(default_factory=list) + string_arrays: List[StringArrayEntry] = field(default_factory=list) + plurals: List[PluralsEntry] = field(default_factory=list) + + @property + def total_count(self) -> int: + return ( + len(self.strings) + + sum(len(a.items) for a in self.string_arrays) + + sum(len(p.items) for p in self.plurals) + ) + + @property + def is_empty(self) -> bool: + return not self.strings and not self.string_arrays and not self.plurals + + def all_flat_entries(self) -> List[StringEntry]: + """All translatable items flattened into StringEntry list.""" + entries: List[StringEntry] = list(self.strings) + for arr in self.string_arrays: + entries.extend(arr.flat_entries()) + for plu in self.plurals: + entries.extend(plu.flat_entries()) + return entries + + def all_keys_for_snapshot(self) -> Dict[str, str]: + """Build key -> hash mapping for snapshot tracking.""" + data: Dict[str, str] = {} + for s in self.strings: + data[s.key] = content_hash(s.text) + for a in self.string_arrays: + data[f"__array__{a.key}"] = content_hash(a.content_for_hash()) + for p in self.plurals: + data[f"__plurals__{p.key}"] = content_hash(p.content_for_hash()) + return data + +@dataclass +class ExistingKeys: + """Track which keys already exist in a target file.""" + strings: Set[str] = field(default_factory=set) + string_arrays: Set[str] = field(default_factory=set) + plurals: Set[str] = field(default_factory=set) + + @property + def all_string_keys(self) -> Set[str]: + return self.strings @dataclass class FrozenText: @@ -346,81 +443,82 @@ def load_snapshot(snapshot_path: Path) -> Dict[str, str]: return {} -def save_snapshot(snapshot_path: Path, entries: List[StringEntry]) -> None: - """Save minimal 
snapshot: key -> hash only.""" +def save_snapshot_full( + snapshot_path: Path, source_resources: SourceResources +) -> None: + """Save snapshot for all resource types.""" try: snapshot_path.parent.mkdir(parents=True, exist_ok=True) - - data = {entry.key: content_hash(entry.text) for entry in entries} - - # Compact JSON format to minimize size + data = source_resources.all_keys_for_snapshot() snapshot_path.write_text( - json.dumps(data, sort_keys=True, separators=(',', ':')), - encoding='utf-8' + json.dumps(data, sort_keys=True, separators=(",", ":")), + encoding="utf-8", ) - logger.debug(f"Saved snapshot: {snapshot_path}") except (IOError, OSError) as e: logger.warning(f"Failed to save snapshot {snapshot_path}: {e}") -def find_changed_entries( - source_entries: List[StringEntry], +def find_changed_resources( + source_resources: SourceResources, snapshot: Dict[str, str], - existing_keys: Set[str], + existing_keys: ExistingKeys, ) -> List[StringEntry]: """ - Find entries where source text changed since last translation. - - Only returns entries that: - 1. Exist in the snapshot (were previously processed) - 2. Have different content hash now - 3. Already exist in target file (need re-translation, not new) + Find ALL changed entries (strings, array items, plural items) + returned as flat StringEntry list for translation. 
""" changed: List[StringEntry] = [] - for entry in source_entries: - # Skip if not in snapshot (new key, handled separately) + # Regular strings + for entry in source_resources.strings: if entry.key not in snapshot: continue - - # Skip if not already translated - if entry.key not in existing_keys: + if entry.key not in existing_keys.strings: continue + if snapshot[entry.key] != content_hash(entry.text): + changed.append(entry) - stored_hash = snapshot[entry.key] - current_hash = content_hash(entry.text) + # String arrays + for arr in source_resources.string_arrays: + snap_key = f"__array__{arr.key}" + if snap_key not in snapshot: + continue + if arr.key not in existing_keys.string_arrays: + continue + if snapshot[snap_key] != content_hash(arr.content_for_hash()): + changed.extend(arr.flat_entries()) - if stored_hash != current_hash: - changed.append(entry) - logger.debug( - f" Detected change in '{entry.key}': " - f"hash {stored_hash[:8]}... → {current_hash[:8]}..." - ) + # Plurals + for plu in source_resources.plurals: + snap_key = f"__plurals__{plu.key}" + if snap_key not in snapshot: + continue + if plu.key not in existing_keys.plurals: + continue + if snapshot[snap_key] != content_hash(plu.content_for_hash()): + changed.extend(plu.flat_entries()) return changed -def _snapshot_needs_update(snapshot: Dict[str, str], source_entries: List[StringEntry]) -> bool: - """Check if snapshot needs to be updated based on source changes.""" - # No snapshot exists +def _snapshot_needs_update_full( + snapshot: Dict[str, str], + source_resources: SourceResources, +) -> bool: + """Check if snapshot needs update based on ALL resource types.""" if not snapshot: return True - current_keys: Set[str] = set() - for entry in source_entries: - current_keys.add(entry.key) - current_hash = content_hash(entry.text) - stored_hash = snapshot.get(entry.key) - - # New key or changed content - if stored_hash != current_hash: - return True + current_data = 
source_resources.all_keys_for_snapshot() - # Check for removed keys - snapshot_keys = set(snapshot.keys()) - if snapshot_keys - current_keys: + # Check for any difference + if set(current_data.keys()) != set(snapshot.keys()): return True + for key, current_hash in current_data.items(): + if snapshot.get(key) != current_hash: + return True + return False @@ -569,45 +667,100 @@ def get_element_full_text(elem: ET._Element) -> str: # ============================================================================ -def read_source_strings(source_xml: Path) -> List[StringEntry]: - """Read translatable strings from source XML, preserving attributes.""" +def read_source_resources(source_xml: Path) -> SourceResources: + """Read ALL translatable resources from source XML.""" tree = ET.parse(str(source_xml), parser=XML_PARSER) root = tree.getroot() - entries: List[StringEntry] = [] + resources = SourceResources() - for node in root.iter("string"): - name = node.get("name") - if not name: - continue - if node.get("translatable", "true").lower() == "false": - continue - raw_text = get_element_full_text(node) - if not raw_text or not raw_text.strip(): + for node in root: + if is_comment(node): continue - preserved: Dict[str, str] = {} - for attr_key, attr_val in node.attrib.items(): - if attr_key in ("name", "translatable"): + + # ── ────────────────────────────────────────── + if node.tag == "string": + name = node.get("name") + if not name: + continue + if node.get("translatable", "true").lower() == "false": continue - if attr_key in PROPAGATE_ATTRIBUTES: - preserved[attr_key] = attr_val - elif attr_key.startswith(f"{{{TOOLS_NAMESPACE}}}"): - preserved[attr_key] = attr_val - entries.append(StringEntry(key=name, text=raw_text, attributes=preserved)) + raw_text = get_element_full_text(node) + if not raw_text or not raw_text.strip(): + continue + preserved = _extract_propagated_attrs(node) + resources.strings.append( + StringEntry(key=name, text=raw_text, attributes=preserved) + ) - 
return entries + # ── ──────────────────────────────────── + elif node.tag == "string-array": + name = node.get("name") + if not name: + continue + if node.get("translatable", "true").lower() == "false": + continue + items: List[str] = [] + for item_node in node.iter("item"): + item_text = get_element_full_text(item_node) + items.append(item_text or "") + if not any(t.strip() for t in items): + continue + preserved = _extract_propagated_attrs(node) + resources.string_arrays.append( + StringArrayEntry(key=name, items=items, attributes=preserved) + ) + # ── ───────────────────────────────────────── + elif node.tag == "plurals": + name = node.get("name") + if not name: + continue + if node.get("translatable", "true").lower() == "false": + continue + quantity_map: Dict[str, str] = {} + for item_node in node.iter("item"): + quantity = item_node.get("quantity") + if quantity: + item_text = get_element_full_text(item_node) + if item_text: + quantity_map[quantity] = item_text + if not quantity_map: + continue + preserved = _extract_propagated_attrs(node) + resources.plurals.append( + PluralsEntry(key=name, items=quantity_map, attributes=preserved) + ) -def read_existing_keys(target_xml: Path) -> Set[str]: - """Read existing string keys from target file.""" + return resources + + +def _extract_propagated_attrs(node: ET._Element) -> Dict[str, str]: + """Extract attributes to propagate from a source node.""" + preserved: Dict[str, str] = {} + for attr_key, attr_val in node.attrib.items(): + if attr_key in ("name", "translatable"): + continue + if attr_key in PROPAGATE_ATTRIBUTES: + preserved[attr_key] = attr_val + elif attr_key.startswith(f"{{{TOOLS_NAMESPACE}}}"): + preserved[attr_key] = attr_val + return preserved + + +def read_existing_keys_full(target_xml: Path) -> ExistingKeys: + """Read existing resource keys from target file (all types).""" + result = ExistingKeys() if not target_xml.exists(): - return set() + return result try: tree = ET.parse(str(target_xml), 
parser=XML_PARSER) root = tree.getroot() - return set(root.xpath("./string/@name")) + result.strings = set(root.xpath("./string/@name")) + result.string_arrays = set(root.xpath("./string-array/@name")) + result.plurals = set(root.xpath("./plurals/@name")) + return result except ET.XMLSyntaxError: - return set() - + return result # ============================================================================ # XML Writing Functions @@ -656,152 +809,143 @@ def set_mixed_string_value( escape_android_text_nodes(node) -def write_translations( +def write_translations_full( target_xml: Path, - translations: Dict[str, str], - source_entries: List[StringEntry], + translations: Dict[str, str], # flat key -> translated text + source_resources: SourceResources, source_xml: Path, validate: bool = True, warn_unknown_tags: bool = True, ) -> int: """ - Write translations to target XML, preserving EXACT source structure. + Write translations including string-arrays and plurals. - For NEW files: Deep copies source, replaces content, removes untranslated strings - For EXISTING files: Merges new translations while preserving structure + The `translations` dict uses flat keys: + - "key" -> string translation + - "key__item_0" -> string-array item + - "key__plural_one" -> plurals quantity variant """ target_xml.parent.mkdir(parents=True, exist_ok=True) - # Parse source with all whitespace and comments preserved source_tree = ET.parse(str(source_xml), parser=XML_PARSER) source_root = source_tree.getroot() - # Check if target already exists if target_xml.exists(): try: existing_tree = ET.parse(str(target_xml), parser=XML_PARSER) existing_root = existing_tree.getroot() - existing_keys = set(existing_root.xpath("./string/@name")) - - return _merge_into_existing( - target_xml, existing_root, translations, source_entries, - source_root, existing_keys, validate, warn_unknown_tags + return _merge_all_into_existing( + target_xml, existing_root, translations, + source_resources, source_root, 
validate, warn_unknown_tags ) except ET.XMLSyntaxError as e: logger.warning(f"Corrupted '{target_xml}', recreating: {e}") - # Create new file from source structure - return _create_from_source( - target_xml, translations, source_entries, + return _create_from_source_full( + target_xml, translations, source_resources, source_root, validate, warn_unknown_tags ) -def _create_from_source( +def _create_from_source_full( target_xml: Path, translations: Dict[str, str], - source_entries: List[StringEntry], + source_resources: SourceResources, source_root: ET._Element, validate: bool, warn_unknown_tags: bool, ) -> int: - """ - Create new translation file by deep copying source and replacing text. - Preserves all comments, whitespace, and exact ordering. - """ - # Deep copy preserves everything + """Create new file from source, filling in all resource types.""" root = copy.deepcopy(source_root) - # Build set of keys that have translations - translated_keys: Set[str] = set(translations.keys()) + # Build lookup sets + translated_string_keys: Set[str] = set() + translated_array_keys: Set[str] = set() + translated_plural_keys: Set[str] = set() + + for flat_key in translations: + if "__item_" in flat_key: + base_key = flat_key.rsplit("__item_", 1)[0] + translated_array_keys.add(base_key) + elif "__plural_" in flat_key: + base_key = flat_key.rsplit("__plural_", 1)[0] + translated_plural_keys.add(base_key) + else: + translated_string_keys.add(flat_key) - # Build sections: list of (comment_elements, string_elements) - # Each section starts with zero or more comments followed by strings - sections: List[Tuple[List[ET._Element], List[ET._Element]]] = [] - current_comments: List[ET._Element] = [] - current_strings: List[ET._Element] = [] + elements_to_remove: List[ET._Element] = [] + written = 0 for elem in list(root): if is_comment(elem): - if current_strings: - # Save previous section and start new one - sections.append((current_comments, current_strings)) - current_comments = [] - 
current_strings = [] - current_comments.append(elem) - elif elem.tag == "string": - current_strings.append(elem) - - if current_comments or current_strings: - sections.append((current_comments, current_strings)) - - written = 0 - elements_to_remove: List[ET._Element] = [] - - # Process each section - for comments, strings in sections: - # Check if this section has any translated strings - section_has_translation = False - for string_elem in strings: - name = string_elem.get("name") - if name and name in translated_keys: - section_has_translation = True - break - # Non-translatable strings don't count - if string_elem.get("translatable", "true").lower() == "false": - section_has_translation = True # Keep non-translatable - break - - if not section_has_translation: - # Remove entire section (comments + strings) - elements_to_remove.extend(comments) - elements_to_remove.extend(strings) continue - # Process strings in this section - for string_elem in strings: - name = string_elem.get("name") - - if not name: - elements_to_remove.append(string_elem) - continue + name = elem.get("name") + if not name: + continue - # Keep non-translatable strings unchanged - if string_elem.get("translatable", "true").lower() == "false": - continue + # Keep non-translatable as-is + if elem.get("translatable", "true").lower() == "false": + continue - if name in translations: - # Update with translation + if elem.tag == "string": + if name in translated_string_keys: value = translations[name] - - # Clear content - string_elem.text = None - for child in list(string_elem): - string_elem.remove(child) - - set_mixed_string_value(string_elem, value, key=name, warn_unknown_tags=warn_unknown_tags) + elem.text = None + for child in list(elem): + elem.remove(child) + set_mixed_string_value( + elem, value, key=name, + warn_unknown_tags=warn_unknown_tags, + ) written += 1 else: - # No translation - remove this string - elements_to_remove.append(string_elem) + elements_to_remove.append(elem) + + elif 
elem.tag == "string-array": + if name in translated_array_keys: + item_nodes = list(elem.iter("item")) + for i, item_node in enumerate(item_nodes): + flat_key = f"{name}__item_{i}" + if flat_key in translations: + value = translations[flat_key] + item_node.text = None + for child in list(item_node): + item_node.remove(child) + set_mixed_string_value( + item_node, value, key=flat_key, + warn_unknown_tags=warn_unknown_tags, + ) + written += 1 + else: + elements_to_remove.append(elem) + + elif elem.tag == "plurals": + if name in translated_plural_keys: + for item_node in elem.iter("item"): + quantity = item_node.get("quantity") + if quantity: + flat_key = f"{name}__plural_{quantity}" + if flat_key in translations: + value = translations[flat_key] + item_node.text = None + for child in list(item_node): + item_node.remove(child) + set_mixed_string_value( + item_node, value, key=flat_key, + warn_unknown_tags=warn_unknown_tags, + ) + written += 1 + else: + elements_to_remove.append(elem) - # Remove elements while preserving whitespace for elem in elements_to_remove: _remove_element_preserve_whitespace(root, elem) - # Clean up redundant namespace declarations ET.cleanup_namespaces(root) - - # Write file tree = ET.ElementTree(root) - tree.write( - str(target_xml), - encoding="utf-8", - xml_declaration=True, - pretty_print=False, - ) - - # Post-process to fix any xliff namespace prefix issues (ns0, ns1 -> xliff) + tree.write(str(target_xml), encoding="utf-8", + xml_declaration=True, pretty_print=False) _fix_xliff_namespaces_in_file(target_xml) if validate: @@ -902,141 +1046,244 @@ def _remove_element_preserve_whitespace(root: ET._Element, elem: ET._Element) -> parent.remove(elem) -def _merge_into_existing( +def _merge_all_into_existing( target_xml: Path, existing_root: ET._Element, translations: Dict[str, str], - source_entries: List[StringEntry], + source_resources: SourceResources, source_root: ET._Element, - existing_keys: Set[str], validate: bool, warn_unknown_tags: 
bool, ) -> int: - """Merge new translations into existing file, preserving source order and comments.""" + """ + Merge new/updated translations into existing file for ALL resource types. - # Build source structure: sections with their comments and strings - source_sections: List[Tuple[List[str], List[str]]] = [] # (comment_texts, string_names) - current_comments: List[str] = [] - current_strings: List[str] = [] + Handles: + - entries (new + updated) + - entries (new + updated items) + - entries (new + updated quantities) + """ + # ── Build lookup maps ────────────────────────────────────── + + # Flat key -> which resource type and base key + array_items_map: Dict[str, Tuple[str, int]] = {} # flat_key -> (array_name, index) + plural_items_map: Dict[str, Tuple[str, str]] = {} # flat_key -> (plural_name, quantity) + string_keys: Set[str] = set() + + for flat_key in translations: + if "__item_" in flat_key: + parts = flat_key.rsplit("__item_", 1) + array_items_map[flat_key] = (parts[0], int(parts[1])) + elif "__plural_" in flat_key: + parts = flat_key.rsplit("__plural_", 1) + plural_items_map[flat_key] = (parts[0], parts[1]) + else: + string_keys.add(flat_key) - # Also track string -> section mapping and string -> preceding whitespace - string_to_section: Dict[str, int] = {} - string_tail: Dict[str, str] = {} - comment_tails: Dict[int, str] = {} # section_index -> tail after last comment + # Group array items by array name + array_translations: Dict[str, Dict[int, str]] = {} # array_name -> {index: text} + for flat_key, (arr_name, idx) in array_items_map.items(): + if arr_name not in array_translations: + array_translations[arr_name] = {} + array_translations[arr_name][idx] = translations[flat_key] + + # Group plural items by plural name + plural_translations: Dict[str, Dict[str, str]] = {} # plural_name -> {quantity: text} + for flat_key, (plu_name, quantity) in plural_items_map.items(): + if plu_name not in plural_translations: + plural_translations[plu_name] = {} + 
plural_translations[plu_name][quantity] = translations[flat_key] + + # ── Get existing elements ────────────────────────────────── + + existing_string_elems: Dict[str, ET._Element] = {} + existing_array_elems: Dict[str, ET._Element] = {} + existing_plural_elems: Dict[str, ET._Element] = {} - for elem in source_root: - if is_comment(elem): - if current_strings: - source_sections.append((current_comments, current_strings)) - current_comments = [] - current_strings = [] - current_comments.append(elem.text or "") - if elem.tail: - comment_tails[len(source_sections)] = elem.tail - elif elem.tag == "string": - name = elem.get("name") - if name: - current_strings.append(name) - string_to_section[name] = len(source_sections) - string_tail[name] = elem.tail or "\n " - - if current_comments or current_strings: - source_sections.append((current_comments, current_strings)) - - # Build flat source order - source_order: List[str] = [] - for comments, strings in source_sections: - source_order.extend(strings) - - # Get existing elements map - existing_elems: Dict[str, ET._Element] = {} for elem in existing_root: if is_comment(elem): continue + name = elem.get("name") + if not name: + continue if elem.tag == "string": - name = elem.get("name") - if name: - existing_elems[name] = elem + existing_string_elems[name] = elem + elif elem.tag == "string-array": + existing_array_elems[name] = elem + elif elem.tag == "plurals": + existing_plural_elems[name] = elem + + # ── Build source ordering ────────────────────────────────── + + source_order: List[Tuple[str, str]] = [] # (tag, name) preserving source order + source_comments: Dict[int, List[str]] = {} # index -> preceding comment texts + current_comments: List[str] = [] + + for elem in source_root: + if is_comment(elem): + current_comments.append(elem.text or "") + continue + name = elem.get("name") + if name and elem.tag in ("string", "string-array", "plurals"): + idx = len(source_order) + if current_comments: + source_comments[idx] = 
list(current_comments) + current_comments = [] + source_order.append((elem.tag, name)) + + # ── Source entry map for attributes ──────────────────────── + + entry_map = {e.key: e for e in source_resources.strings} + array_entry_map = {a.key: a for a in source_resources.string_arrays} + plural_entry_map = {p.key: p for p in source_resources.plurals} - entry_map = {e.key: e for e in source_entries} written = 0 - # Track which sections we've added comments for - added_section_comments: Set[int] = set() + # ── 1. Process regular strings ───────────────────────────── - # Process translations (both new and updated) - for key, value in translations.items(): + for key in string_keys: + value = translations[key] entry = entry_map.get(key) - if not entry: - continue - # Check if this is an UPDATE to existing string - if key in existing_elems: - # Update existing element in place - node = existing_elems[key] + if key in existing_string_elems: + # Update existing + node = existing_string_elems[key] node.text = None for child in list(node): node.remove(child) set_mixed_string_value(node, value, key=key, warn_unknown_tags=warn_unknown_tags) written += 1 - continue - - # This is a NEW string - add it - section_idx = string_to_section.get(key, 0) - - # Find insertion point - key_idx = source_order.index(key) if key in source_order else len(source_order) - insert_before: Optional[ET._Element] = None - for next_key in source_order[key_idx + 1:]: - if next_key in existing_elems: - insert_before = existing_elems[next_key] - break + elif entry: + # Add new string + attrs = entry.get_propagated_attributes() + node = ET.Element("string", **attrs) + set_mixed_string_value(node, value, key=key, warn_unknown_tags=warn_unknown_tags) + node.tail = "\n " + _insert_at_source_position( + existing_root, node, "string", key, + source_order, existing_string_elems, + existing_array_elems, existing_plural_elems, + ) + existing_string_elems[key] = node + written += 1 - # Add section comments if not 
already added - if section_idx not in added_section_comments: - comments, _ = source_sections[section_idx] if section_idx < len(source_sections) else ([], []) - if comments: - for comment_text in comments: - comment = ET.Comment(comment_text) - comment.tail = "\n " - - if insert_before is not None: - # Add blank line before section - prev = insert_before.getprevious() - if prev is not None and not is_comment(prev): - prev.tail = "\n\n " - insert_before.addprevious(comment) - else: - # Append at end - children = list(existing_root) - if children: - last = children[-1] - if not is_comment(last): - last.tail = "\n\n " - existing_root.append(comment) + # ── 2. Process string-arrays ─────────────────────────────── - added_section_comments.add(section_idx) + for arr_name, item_translations in array_translations.items(): + arr_entry = array_entry_map.get(arr_name) + if not arr_entry: + continue - # Create string element - attrs = entry.get_propagated_attributes() - node = ET.Element("string", **attrs) + if arr_name in existing_array_elems: + # Update existing array items + arr_elem = existing_array_elems[arr_name] + item_nodes = list(arr_elem.iter("item")) + + for idx, value in item_translations.items(): + if idx < len(item_nodes): + # Update existing item + item_node = item_nodes[idx] + item_node.text = None + for child in list(item_node): + item_node.remove(child) + set_mixed_string_value( + item_node, value, + key=f"{arr_name}[{idx}]", + warn_unknown_tags=warn_unknown_tags, + ) + written += 1 + else: + # Create new string-array from source structure + source_arr_elem = None + for elem in source_root: + if elem.tag == "string-array" and elem.get("name") == arr_name: + source_arr_elem = elem + break + + if source_arr_elem is not None: + new_arr = copy.deepcopy(source_arr_elem) + item_nodes = list(new_arr.iter("item")) + + for idx, value in item_translations.items(): + if idx < len(item_nodes): + item_node = item_nodes[idx] + item_node.text = None + for child in 
list(item_node): + item_node.remove(child) + set_mixed_string_value( + item_node, value, + key=f"{arr_name}[{idx}]", + warn_unknown_tags=warn_unknown_tags, + ) + written += 1 + + new_arr.tail = "\n\n " + _insert_at_source_position( + existing_root, new_arr, "string-array", arr_name, + source_order, existing_string_elems, + existing_array_elems, existing_plural_elems, + ) + existing_array_elems[arr_name] = new_arr - set_mixed_string_value(node, value, key=key, warn_unknown_tags=warn_unknown_tags) + # ── 3. Process plurals ───────────────────────────────────── - # Set tail from source - node.tail = string_tail.get(key, "\n ") + for plu_name, qty_translations in plural_translations.items(): + plu_entry = plural_entry_map.get(plu_name) + if not plu_entry: + continue - # Insert element - if insert_before is not None: - insert_before.addprevious(node) + if plu_name in existing_plural_elems: + # Update existing plural items + plu_elem = existing_plural_elems[plu_name] + + for item_node in plu_elem.iter("item"): + quantity = item_node.get("quantity") + if quantity and quantity in qty_translations: + value = qty_translations[quantity] + item_node.text = None + for child in list(item_node): + item_node.remove(child) + set_mixed_string_value( + item_node, value, + key=f"{plu_name}[{quantity}]", + warn_unknown_tags=warn_unknown_tags, + ) + written += 1 else: - existing_root.append(node) + # Create new plurals from source structure + source_plu_elem = None + for elem in source_root: + if elem.tag == "plurals" and elem.get("name") == plu_name: + source_plu_elem = elem + break + + if source_plu_elem is not None: + new_plu = copy.deepcopy(source_plu_elem) + + for item_node in new_plu.iter("item"): + quantity = item_node.get("quantity") + if quantity and quantity in qty_translations: + value = qty_translations[quantity] + item_node.text = None + for child in list(item_node): + item_node.remove(child) + set_mixed_string_value( + item_node, value, + key=f"{plu_name}[{quantity}]", + 
warn_unknown_tags=warn_unknown_tags, + ) + written += 1 + + new_plu.tail = "\n\n " + _insert_at_source_position( + existing_root, new_plu, "plurals", plu_name, + source_order, existing_string_elems, + existing_array_elems, existing_plural_elems, + ) + existing_plural_elems[plu_name] = new_plu - existing_keys.add(key) - existing_elems[key] = node - written += 1 + # ── Fix final element tail ───────────────────────────────── - # Fix final element tail children = list(existing_root) if children: for child in reversed(children): @@ -1045,10 +1292,10 @@ def _merge_into_existing( child.tail = "\n" break + # ── Write file ───────────────────────────────────────────── + if written > 0: - # Clean up redundant namespace declarations ET.cleanup_namespaces(existing_root) - tree = ET.ElementTree(existing_root) tree.write( str(target_xml), @@ -1056,8 +1303,6 @@ def _merge_into_existing( xml_declaration=True, pretty_print=False, ) - - # Post-process to fix any xliff namespace prefix issues (ns0, ns1 -> xliff) _fix_xliff_namespaces_in_file(target_xml) if validate: @@ -1069,6 +1314,62 @@ def _merge_into_existing( return written +def _insert_at_source_position( + root: ET._Element, + new_elem: ET._Element, + tag: str, + name: str, + source_order: List[Tuple[str, str]], + existing_strings: Dict[str, ET._Element], + existing_arrays: Dict[str, ET._Element], + existing_plurals: Dict[str, ET._Element], +) -> None: + """ + Insert element at the correct position matching source file ordering. + Falls back to appending at end if no reference point found. 
+ """ + # Find this element's position in source order + try: + my_idx = next( + i for i, (t, n) in enumerate(source_order) + if t == tag and n == name + ) + except StopIteration: + # Not found in source order, append at end + root.append(new_elem) + return + + # Look forward in source order for an existing element to insert before + for future_tag, future_name in source_order[my_idx + 1:]: + ref_elem = None + if future_tag == "string": + ref_elem = existing_strings.get(future_name) + elif future_tag == "string-array": + ref_elem = existing_arrays.get(future_name) + elif future_tag == "plurals": + ref_elem = existing_plurals.get(future_name) + + if ref_elem is not None: + ref_elem.addprevious(new_elem) + return + + # Look backward for an element to insert after + for past_tag, past_name in reversed(source_order[:my_idx]): + ref_elem = None + if past_tag == "string": + ref_elem = existing_strings.get(past_name) + elif past_tag == "string-array": + ref_elem = existing_arrays.get(past_name) + elif past_tag == "plurals": + ref_elem = existing_plurals.get(past_name) + + if ref_elem is not None: + ref_elem.addnext(new_elem) + return + + # Nothing found, append at end + root.append(new_elem) + # ============================================================================ # File Discovery # ============================================================================ @@ -1077,10 +1378,13 @@ def _merge_into_existing( def find_source_files(repo_root: Path, exclude_dirs: FrozenSet[str]) -> List[Path]: """Find all source strings.xml files in the repository.""" paths: List[Path] = [] - patterns = [ - "src/*/res/values/strings.xml", - "src/*/composeResources/values/strings.xml", - ] + resource_filenames = ("strings.xml", "arrays.xml") + patterns = [] + + for fname in resource_filenames: + patterns.append(f"src/*/res/values/{fname}") + patterns.append(f"src/*/composeResources/values/{fname}") + for pat in patterns: for p in repo_root.rglob(pat): if any(part in exclude_dirs for 
part in p.parts): @@ -1099,7 +1403,7 @@ def get_target_path(source_xml: Path, locale: str) -> Path: raise ValueError(f"Invalid locale: {locale}") values_dir = source_xml.parent parent = values_dir.parent - return parent / f"values-{locale}" / "strings.xml" + return parent / f"values-{locale}" / source_xml.name def get_module_name(source_path: Path) -> str: @@ -1142,6 +1446,11 @@ class GeminiTranslator: 4. KEYS: - Every input key must appear exactly once in the output. + - Keys may contain __item_N (array items) or __plural_QUANTITY + - (plural forms) suffixes — translate the TEXT only, never the key. + +5. PLURALS: For __plural_one, __plural_other, __plural_few, etc., + use the grammatically correct plural form for the target language. 5. WHITESPACE: - Preserve leading and trailing spaces if present in the original. @@ -1328,33 +1637,50 @@ def process_locale( config: Config, translator: Optional[GeminiTranslator], snapshot: Dict[str, str], - source_entries: List[StringEntry], + source_resources: SourceResources, ) -> LocaleResult: """Process translations for a single source file and locale.""" target_xml = get_target_path(source_xml, locale) result = LocaleResult(locale=locale, source_path=source_xml, target_path=target_xml) - result.total_source = len(source_entries) + result.total_source = source_resources.total_count - if not source_entries: + + if source_resources.is_empty: logger.warning(f"No translatable strings in {source_xml}") return result - existing_keys = read_existing_keys(target_xml) - result.already_translated = len(existing_keys & {e.key for e in source_entries}) + + all_flat = source_resources.all_flat_entries() + all_flat_keys = {e.key for e in all_flat} + + existing = read_existing_keys_full(target_xml) + + existing_flat_keys: Set[str] = set(existing.strings) + + for arr in source_resources.string_arrays: + if arr.key in existing.string_arrays: + for fe in arr.flat_entries(): + existing_flat_keys.add(fe.key) + for plu in source_resources.plurals: 
+ if plu.key in existing.plurals: + for fe in plu.flat_entries(): + existing_flat_keys.add(fe.key) + + result.already_translated = len(existing_flat_keys & all_flat_keys) # Find missing entries (new keys not yet translated) - missing_entries = [e for e in source_entries if e.key not in existing_keys] + missing_entries = [e for e in all_flat if e.key not in existing_flat_keys] - # Find changed entries (source text modified since last translation) - changed_entries = find_changed_entries(source_entries, snapshot, existing_keys) - result.changed_count = len(changed_entries) + # Changed entries + changed_entries = find_changed_resources(source_resources, snapshot, existing) + result.changed_count = len(changed_entries ) # Combine both lists entries_to_translate = missing_entries + changed_entries if not entries_to_translate: - logger.info(f" [{locale}] All {result.total_source} strings up to date") + logger.info(f" [{locale}] All {result.total_source} items up to date") return result # Log what needs translation @@ -1432,10 +1758,10 @@ def process_locale( if translations: try: - written = write_translations( + written = write_translations_full( target_xml=target_xml, translations=translations, - source_entries=entries_to_translate, + source_resources=source_resources, source_xml=source_xml, validate=config.validate_output, warn_unknown_tags=config.warn_unknown_tags, @@ -1472,10 +1798,12 @@ def process_all(config: Config) -> ProcessingResult: snapshot = load_snapshot(snapshot_path) # Read source entries once per source file - source_entries = read_source_strings(source_xml) + source_resources = read_source_resources(source_xml) # Determine if snapshot needs update - snapshot_needs_update = _snapshot_needs_update(snapshot, source_entries) + snapshot_needs_update = _snapshot_needs_update_full( + snapshot, source_resources + ) if snapshot_needs_update and snapshot: logger.debug(f" Source strings changed since last snapshot") @@ -1485,7 +1813,7 @@ def process_all(config: 
Config) -> ProcessingResult: for locale in config.locales: locale_result = process_locale( - source_xml, locale, config, translator, snapshot, source_entries + source_xml, locale, config, translator, snapshot, source_resources ) result.locale_results.append(locale_result) @@ -1511,7 +1839,7 @@ def process_all(config: Config) -> ProcessingResult: save_reason = "Created" if should_save_snapshot: - save_snapshot(snapshot_path, source_entries) + save_snapshot_full(snapshot_path, source_resources) logger.info(f" {save_reason} snapshot: {snapshot_path.name}") return result From a33d7f1897fab2ea42f881b48ce201cdf8cbbbc1 Mon Sep 17 00:00:00 2001 From: mark Date: Thu, 26 Feb 2026 11:04:42 +0200 Subject: [PATCH 8/9] feat(translate): add orphaned translation cleanup and improve XML formatting - Implement `_cleanup_orphaned_translations` to remove localized strings, arrays, and plurals that no longer exist in the source English file. - Add logic to remove orphaned comments preceding deleted resource elements. - Improve XML whitespace management with `_normalize_resource_whitespace` to prevent empty line buildup during updates. - Enhance placeholder detection to include `\n` and `\t` as frozen tokens. - Update GitHub Action to support manual triggers (`workflow_dispatch`) with configurable parameters for locales, models, and batch sizes. - Expand GitHub Action to include `cmp-navigation/` in automated commits. - Fix XML header and resource tag indentation in generated files. 
--- .github/workflows/mobile-i18n-autofill-pr.yml | 54 ++++- docs/TRANSLATE.md | 81 +++++++ translate.py | 225 ++++++++++++++---- 3 files changed, 295 insertions(+), 65 deletions(-) create mode 100644 docs/TRANSLATE.md diff --git a/.github/workflows/mobile-i18n-autofill-pr.yml b/.github/workflows/mobile-i18n-autofill-pr.yml index aae20b3b..74e11ce0 100644 --- a/.github/workflows/mobile-i18n-autofill-pr.yml +++ b/.github/workflows/mobile-i18n-autofill-pr.yml @@ -1,13 +1,32 @@ name: Mobile i18n Autofill (bot PR) on: + workflow_dispatch: + inputs: + locales: + description: 'Comma-separated locale codes to translate' + required: false + default: 'ar,bn,de,en,es,fa,fr,gu,hi,hu,id,km,kn,ml,mr,ms,my,pl,pt,ru,si,sw,te,ur' + type: string + model: + description: 'Gemini model to use for translation' + required: false + default: 'gemma-3-27b-it' + type: string + batch-size: + description: 'Number of strings per translation batch' + required: false + default: '15' + type: string + repo-root: + description: 'Module root to scope translation (e.g. ./feature/settings)' + required: false + default: '.' + type: string + pull_request: + types: [labeled] branches: [dev] - paths: - - 'cmp-android/**' - - 'feature/**' - - '.github/workflows/mobile-i18n-autofill-pr.yml' - - 'translate.py' permissions: contents: write @@ -15,15 +34,24 @@ permissions: jobs: i18n-autofill: + if: >- + github.event_name == 'workflow_dispatch' || + github.event.label.name == 'needs-translation' runs-on: ubuntu-latest + env: + LOCALES: ${{ inputs.locales || 'ar,bn,de,en,es,fa,fr,gu,hi,hu,id,km,kn,ml,mr,ms,my,pl,pt,ru,si,sw,te,ur' }} + MODEL: ${{ inputs.model || 'gemma-3-27b-it' }} + BATCH_SIZE: ${{ inputs.batch-size || '15' }} + REPO_ROOT: ${{ inputs.repo-root || '.' 
}} + steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 0 - repository: ${{ github.event.pull_request.head.repo.full_name }} - ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} + ref: ${{ github.event.pull_request.head.ref || github.ref }} token: ${{ secrets.GITHUB_TOKEN }} - name: Set up JDK 21 @@ -48,9 +76,10 @@ jobs: run: | python translate.py \ --mode apply \ - --locales "ar,bn,de,en,es,fa,fr,gu,hi,hu,id,km,kn,ml,mr,ms,my,pl,pt,ru,si,sw,te,ur" \ - --model "gemma-3-27b-it" \ - --batch-size 15 + --repo-root "$REPO_ROOT" \ + --locales "$LOCALES" \ + --model "$MODEL" \ + --batch-size $BATCH_SIZE - name: Validate Android resources compile run: | @@ -61,12 +90,11 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - git add cmp-android/ feature/ + git add cmp-android/ cmp-navigation/ feature/ if ! git diff --cached --quiet; then git commit -m "chore: auto-generate mobile i18n translations" - # Since we checked out the fork, 'origin' is now the fork repo. - git push origin HEAD:${{ github.event.pull_request.head.ref }} + git push origin HEAD:${{ github.event.pull_request.head.ref || github.ref_name }} else echo "No changes to commit." fi diff --git a/docs/TRANSLATE.md b/docs/TRANSLATE.md new file mode 100644 index 00000000..c6ee58e0 --- /dev/null +++ b/docs/TRANSLATE.md @@ -0,0 +1,81 @@ +# Android String Resource Translator (`translate.py`) + +A production-ready Python script for translating Android string resources (`strings.xml` and `arrays.xml`) using the Google Gemini API. + +## Features + +- **Format Preservation**: Ensures comments, spacing (blank lines), and structure match the source file exactly. 
+- **Placeholder & Markup Safety**: Freezes placeholders (e.g., `%s`, `%1$d`) and markup tags (e.g., `<b>`, `<i>`) before translating to guarantee they are preserved and kept in the correct order. +- **Source Attribute Propagation**: Copies attributes like `formatted`, `product`, and `tools:*` to the translated strings. +- **Robust Error Handling**: Includes batch translation with individual string fallback on failure, and automatic retry mechanisms for rate limits (429) or model overloads (503). +- **Change Detection**: Tracks source strings through a simple hash-based snapshot mechanism (`.translation_snapshots/`). Only new strings and strings whose source text was modified are re-translated, saving time and tokens. +- **Advanced Resource Support**: Translates single `<string>`, ordered `<string-array>`, and `<plurals>` resources out of the box. +- **Character Compatibility**: Manages HTML entity conversions and robust Android special character escaping. +- **AAPT2 Compatibility**: Implements proper `xliff` namespace handling to prevent build errors. + +## Requirements + +1. Python 3.8+ +2. Required packages: + ```bash + pip install google-genai lxml + ``` +3. A Google Gemini API Key + +## Usage + +Set your Google Gemini API key as an environment variable: +```bash +export GEMINI_API_KEY=your_api_key_here +``` +*(You can customize the environment variable name via the `--api-key-env` flag).* + +### Applying Translations + +Run the script in `apply` mode to fetch missing strings and write translated files directly to their respective `values-{locale}` folders. + +```bash +# Basic usage +python translate.py --mode apply --locales es,de,fr + +# Using a specific model and fine-tuning batch parameters +python translate.py \ + --mode apply \ + --repo-root .
\ + --locales ar \ + --model gemma-3-27b-it \ + --batch-size 15 \ + --request-delay 4.0 +``` + +### Checking for Missing Translations + +Run the script in `check` mode inside CI/CD workflows to simply verify whether all strings are translated without making any actual API calls or file modifications. + +```bash +python translate.py --mode check --locales es,de,fr +``` +*In `check` mode, the script exits with code `2` if translations are missing.* + +## Available Command-Line Arguments + +- `--mode` (Required): `apply` (to translate and write XML) or `check` (to only check for missing keys). +- `--locales`: A comma-separated list of target Android language/region codes (e.g. `es,fr,de,ar`). Default is `es,de`. +- `--repo-root`: The path to the root of the Android project (where to search for `src/*/res/values/strings.xml` and `arrays.xml`, or the Compose Multiplatform equivalent). Default is `.`. +- `--model`: The Gemini API model to use. Default is `gemini-2.0-flash`. +- `--batch-size`: Number of strings to send in a single Gemini API request. Default is `20` (capped at `15` for Gemma models). +- `--request-delay`: Delay in seconds between API requests to prevent immediate rate-limiting. Default is `2.0` (forced to `4.0` for Gemma models). +- `--api-key-env`: Name of the environment variable used to retrieve the API key. Default is `GEMINI_API_KEY`. +- `--no-validate`: Disable automatic malformed XML checks after writing translations. +- `--verbose` / `-v`: Enable debug-level logging. + +## Under The Hood + +### 1. Snapshot Tracking +When you successfully translate strings, the script saves a JSON file in `.translation_snapshots/` within the source module. Subsequent runs will compare current source text against these hashes, allowing `translate.py` to seamlessly fix previously translated strings if you tweak the original English wording. + +### 2. 
Orphaned Translations cleanup +In `apply` mode, if a developer deletes a string or an array item from the english source, the script reliably detects and strips the orphaned translation from all localized strings files to avoid accumulation of unused strings. + +### 3. Rate Limit Handling +If the Google Gemini backend responds with `429 Rate limited` or `503 Service Unavailable`, `translate.py` will automatically backoff and retry according to `--max-retries` and the wait times embedded in API responses. diff --git a/translate.py b/translate.py index caa7422b..b3854a73 100644 --- a/translate.py +++ b/translate.py @@ -104,6 +104,8 @@ PLACEHOLDER_PATTERNS = [ r"%%", r"%n", + r"\\n", + r"\\t", r"%(?:\d+\$)?[-+# 0,(]*\d*(?:\.\d+)?[sdbBhHoOxXeEfgGaAcC]", r"%(?:\d+\$)?[-+# 0,(]*\d*(?:\.\d+)?t[HIklMSLNpzZsQBbhAaCYyjmdeRTrDFc]", ] @@ -412,14 +414,11 @@ def get_snapshot_path(source_xml: Path, repo_root: Path) -> Path: if "src" in parts: src_index = parts.index("src") - # Module root is everything before "src" module_root = Path(*parts[:src_index]) - # Relative path from module root (including "src") relative_parts = parts[src_index:] safe_name = "_".join(relative_parts) return module_root / ".translation_snapshots" / f"{safe_name}.json" - # Fallback: use repo root try: relative = source_xml.relative_to(repo_root) safe_name = str(relative).replace("/", "_").replace("\\", "_") @@ -511,7 +510,6 @@ def _snapshot_needs_update_full( current_data = source_resources.all_keys_for_snapshot() - # Check for any difference if set(current_data.keys()) != set(snapshot.keys()): return True @@ -847,6 +845,107 @@ def write_translations_full( ) +def _cleanup_orphaned_translations( + target_xml: Path, + source_resources: SourceResources, +) -> int: + """ + Remove entries from target file that no longer exist in source. + Returns count of removed entries. 
+ """ + if not target_xml.exists(): + return 0 + + try: + tree = ET.parse(str(target_xml), parser=XML_PARSER) + root = tree.getroot() + except ET.XMLSyntaxError: + return 0 + + source_string_keys: Set[str] = {e.key for e in source_resources.strings} + source_array_keys: Set[str] = {a.key for a in source_resources.string_arrays} + source_plural_keys: Set[str] = {p.key for p in source_resources.plurals} + + + elements_to_remove: List[ET._Element] = [] + removed_names: List[str] = [] + + for elem in list(root): + if is_comment(elem): + continue + + name = elem.get("name") + if not name: + continue + + if elem.tag == "string": + if name not in source_string_keys: + elements_to_remove.append(elem) + removed_names.append(f"string:{name}") + + elif elem.tag == "string-array": + if name not in source_array_keys: + elements_to_remove.append(elem) + removed_names.append(f"string-array:{name}") + + elif elem.tag == "plurals": + if name not in source_plural_keys: + elements_to_remove.append(elem) + removed_names.append(f"plurals:{name}") + + if not elements_to_remove: + return 0 + + for elem in elements_to_remove: + _remove_element_and_orphaned_comments(root, elem) + + _normalize_resource_whitespace(root) + + children = list(root) + if children: + for child in reversed(children): + if not is_comment(child): + if not child.tail or not child.tail.endswith("\n"): + child.tail = "\n" + break + + ET.cleanup_namespaces(root) + tree = ET.ElementTree(root) + tree.write( + str(target_xml), + encoding="utf-8", + xml_declaration=True, + pretty_print=False, + ) + _fix_xliff_namespaces_in_file(target_xml) + + for name in removed_names: + logger.info(f" ✕ Removed orphaned: {name}") + + return len(elements_to_remove) + +def _remove_element_and_orphaned_comments( + root: ET._Element, elem: ET._Element +) -> None: + """ + Remove element AND any preceding comments that would become orphaned. + + Example: if removing the last string under comment, + remove the comment too. 
+ """ + parent = elem.getparent() + if parent is None: + return + + prev = elem.getprevious() + + _remove_element_preserve_whitespace(root, elem) + + if prev is not None and is_comment(prev): + next_sibling = prev.getnext() + if next_sibling is None or is_comment(next_sibling): + _remove_element_preserve_whitespace(root, prev) + def _create_from_source_full( target_xml: Path, translations: Dict[str, str], @@ -884,7 +983,6 @@ def _create_from_source_full( if not name: continue - # Keep non-translatable as-is if elem.get("translatable", "true").lower() == "false": continue @@ -942,6 +1040,8 @@ def _create_from_source_full( for elem in elements_to_remove: _remove_element_preserve_whitespace(root, elem) + _normalize_resource_whitespace(root) + ET.cleanup_namespaces(root) tree = ET.ElementTree(root) tree.write(str(target_xml), encoding="utf-8", @@ -979,6 +1079,18 @@ def _fix_xliff_namespaces_in_file(target_xml: Path) -> None: content ) + content = re.sub( + r'(-->)\s*()\s*( None: See https://github.com/openMF/kmp-project-template/blob/main/LICENSE -->''' - # Add copyright header if missing (check for "Copyright" in a comment) if '