cpp
/
Standard-of-Iron
mirror of https://github.com/djeada/Standard-of-Iron.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
							#!/usr/bin/env bash
set -euo pipefail

# ts2csv.sh — Convert Qt .ts translation files to CSV
# Usage examples:
#   ./scripts/ts2csv.sh                 # convert all in ./translations -> ./translations/csv (2-column)
#   ./scripts/ts2csv.sh -m 1col         # single-column CSV of source strings
#   ./scripts/ts2csv.sh -o ./out csv/translations/app_de.ts
#   ./scripts/ts2csv.sh --bom --no-header
#
# Flags:
#   -o, --outdir DIR          Output directory (default: <repo>/translations/csv)
#   -m, --mode MODE           "2col" (default) or "1col"
#       --include-unfinished  Include messages with translation type="unfinished" (default: on)
#       --exclude-unfinished  Exclude unfinished (default is include)
#       --keep-obsolete       Keep messages marked obsolete/vanished (default: skip)
#       --bom                 Write UTF-8 with BOM (Excel-friendly)
#       --no-header           Do not write header row
#       --explode-plurals     Instead of joining plural forms, create separate rows
#
# Notes:
# - Language is taken from TS/@language if present; otherwise guessed from filename (e.g. *_de.ts -> de).
# - Plural forms (numerus) are joined with " | " unless --explode-plurals is used.
# - Embedded newlines in fields are flattened to the literal two characters "\n" by default.
# - All CSV fields are quoted (RFC 4180), so commas and quotes inside fields are safe.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DEFAULT_INDIR="$REPO_ROOT/translations"
OUTDIR="$REPO_ROOT/translations/csv"

MODE="2col"
INCLUDE_UNFINISHED=1
KEEP_OBSOLETE=0
WRITE_BOM=0
WRITE_HEADER=1
EXPLODE_PLURALS=0

# Parse args
ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    -o|--outdir) OUTDIR="$2"; shift 2;;
    -m|--mode) MODE="$2"; shift 2;;
    --include-unfinished) INCLUDE_UNFINISHED=1; shift;;
    --exclude-unfinished) INCLUDE_UNFINISHED=0; shift;;
    --keep-obsolete) KEEP_OBSOLETE=1; shift;;
    --bom) WRITE_BOM=1; shift;;
    --no-header) WRITE_HEADER=0; shift;;
    --explode-plurals) EXPLODE_PLURALS=1; shift;;
    -h|--help)
      grep '^# ' "$0" | sed 's/^# //'
      exit 0
      ;;
    *) ARGS+=("$1"); shift;;
  esac
done
set -- "${ARGS[@]}"

mkdir -p "$OUTDIR"

# If no files passed, use all .ts in default translations dir
if [[ $# -eq 0 ]]; then
  mapfile -t FILES < <(find "$DEFAULT_INDIR" -maxdepth 1 -type f -name '*.ts' | sort)
else
  FILES=("$@")
fi

if [[ ${#FILES[@]} -eq 0 ]]; then
  echo "No .ts files found. Looked in: $DEFAULT_INDIR" >&2
  exit 1
fi

python3 - <<'PY' "$OUTDIR" "$MODE" "$INCLUDE_UNFINISHED" "$KEEP_OBSOLETE" "$WRITE_BOM" "$WRITE_HEADER" "$EXPLODE_PLURALS" "${FILES[@]}"
import sys, re, csv, xml.etree.ElementTree as ET
from pathlib import Path

# Pull configuration from argv (provided by the bash wrapper for simplicity)
OUTDIR           = Path(sys.argv[1])
MODE             = sys.argv[2]           # "2col" | "1col"
INCL_UNFINISHED  = sys.argv[3] == "1"
KEEP_OBSOLETE    = sys.argv[4] == "1"
WRITE_BOM        = sys.argv[5] == "1"
WRITE_HEADER     = sys.argv[6] == "1"
EXPLODE_PLURALS  = sys.argv[7] == "1"
FILES            = [Path(p) for p in sys.argv[8:]]

OUTDIR.mkdir(parents=True, exist_ok=True)

def guess_lang(ts_path: Path, root: ET.Element) -> str:
    lang = (root.get("language") or "").strip()
    if lang:
        return lang
    # fallback: extract last _xx before .ts
    m = re.search(r'_([A-Za-z]{2,})(?:_[A-Za-z0-9]+)?\.ts$', ts_path.name)
    return (m.group(1) if m else "unknown")

def should_skip_translation(trans_el: ET.Element) -> bool:
    t = (trans_el.get("type") or "").lower()
    if t in ("obsolete", "vanished"):
        return not KEEP_OBSOLETE
    return False

def is_unfinished(trans_el: ET.Element) -> bool:
    return (trans_el.get("type") or "").lower() == "unfinished"

def text_of(elem: ET.Element) -> str:
    # Concatenate text including nested nodes, preserving newlines
    parts = []
    if elem.text:
        parts.append(elem.text)
    for child in elem:
        parts.append(text_of(child))
        if child.tail:
            parts.append(child.tail)
    return "".join(parts)

def plural_forms(trans_el: ET.Element):
    # Return list of plural form strings if numerus; else single string list
    nufs = list(trans_el.findall("./numerusform"))
    if nufs:
        return [text_of(n).strip() for n in nufs]
    return [(text_of(trans_el) or "").strip()]

def src_text(msg_el: ET.Element) -> str:
    s = msg_el.find("source")
    return (text_of(s) if s is not None else "").strip()

def is_obsolete_or_vanished(msg_el: ET.Element) -> bool:
    t = msg_el.find("translation")
    if t is None:
        return False
    typ = (t.get("type") or "").lower()
    return typ in ("obsolete", "vanished")

def each_message(root: ET.Element):
    for msg in root.findall(".//context/message"):
        yield msg

def explode_plural_rows(source: str, forms):
    return [(f"{source} [plural {idx}]", val) for idx, val in enumerate(forms)]

def write_csv(rows, header, out_path: Path, bom=False):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    encoding = "utf-8-sig" if bom else "utf-8"

    def sanitize(s: str) -> str:
        if s is None:
            return ""
        # normalize then flatten newlines to literal "\n"
        s = s.replace("\r\n", "\n").replace("\r", "\n")
        s = s.replace("\n", "\\n")
        return s

    with out_path.open("w", newline="", encoding=encoding) as f:
        # Quote EVERYTHING so commas/newlines/quotes are safe; CRLF for Excel
        w = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\r\n")
        if header:
            w.writerow([sanitize(h) for h in header])
        for r in rows:
            w.writerow([sanitize(c) for c in r])

for ts_path in FILES:
    try:
        tree = ET.parse(ts_path)
        root = tree.getroot()
    except Exception as e:
        print(f"[ERROR] Failed to parse {ts_path}: {e}", file=sys.stderr)
        continue

    lang = guess_lang(ts_path, root)
    rows = []

    if MODE == "1col":
        seen = set()
        for msg in each_message(root):
            if not KEEP_OBSOLETE and is_obsolete_or_vanished(msg):
                continue
            s = src_text(msg)
            if not s:
                continue
            if s not in seen:
                seen.add(s)
                rows.append([s])
        header = ["source"] if WRITE_HEADER else None
        out_name = ts_path.stem + "_single.csv"
        out_path = OUTDIR / out_name
        write_csv(rows, header, out_path, bom=WRITE_BOM)
        print(f"[OK] {ts_path.name} -> {out_path}")
        continue

    # MODE == "2col"
    for msg in each_message(root):
        if not KEEP_OBSOLETE and is_obsolete_or_vanished(msg):
            continue
        s = src_text(msg)
        if not s:
            continue
        t_el = msg.find("translation")
        if t_el is None:
            t_forms = [""]
            unfinished = True
        else:
            if should_skip_translation(t_el):
                continue
            t_forms = plural_forms(t_el)
            unfinished = is_unfinished(t_el)

        if not INCL_UNFINISHED and unfinished:
            continue

        if EXPLODE_PLURALS and len(t_forms) > 1:
            rows.extend(explode_plural_rows(s, t_forms))
        else:
            t_joined = " | ".join(t_forms)
            rows.append([s, t_joined])

    header = (["source", f"translation_{lang}"] if WRITE_HEADER else None)
    out_name = ts_path.stem + ".csv"
    out_path = OUTDIR / out_name
    write_csv(rows, header, out_path, bom=WRITE_BOM)
    print(f"[OK] {ts_path.name} -> {out_path}")

PY

echo "Done. CSV files are in: $OUTDIR"