| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226 |
- #!/usr/bin/env bash
- set -euo pipefail
- # ts2csv.sh — Convert Qt .ts translation files to CSV
- # Usage examples:
- # ./scripts/ts2csv.sh # convert all in ./translations -> ./translations/csv (2-column)
- # ./scripts/ts2csv.sh -m 1col # single-column CSV of source strings
- # ./scripts/ts2csv.sh -o ./out csv/translations/app_de.ts
- # ./scripts/ts2csv.sh --bom --no-header
- #
- # Flags:
- # -o, --outdir DIR Output directory (default: <repo>/translations/csv)
- # -m, --mode MODE "2col" (default) or "1col"
- # --include-unfinished Include messages with translation type="unfinished" (default: on)
- # --exclude-unfinished Exclude unfinished (default is include)
- # --keep-obsolete Keep messages marked obsolete/vanished (default: skip)
- # --bom Write UTF-8 with BOM (Excel-friendly)
- # --no-header Do not write header row
- # --explode-plurals Instead of joining plural forms, create separate rows
- #
- # Notes:
- # - Language is taken from TS/@language if present; otherwise guessed from filename (e.g. *_de.ts -> de).
- # - Plural forms (numerus) are joined with " | " unless --explode-plurals is used.
- # - Embedded newlines in fields are always flattened to the literal two characters "\n" (backslash, n); there is no option to keep them.
- # - All CSV fields are quoted (RFC 4180), so commas and quotes inside fields are safe.
# Resolve paths relative to this script's own location so the tool can be
# started from any working directory. The repository root is taken to be the
# parent of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Where .ts files are looked up when none are named on the command line, and
# where the generated CSV files go unless -o/--outdir overrides it.
DEFAULT_INDIR="$REPO_ROOT/translations"
OUTDIR="$DEFAULT_INDIR/csv"

# Output layout: "2col" or "1col" (see -m/--mode).
MODE="2col"

# On/off switches, stored as 1/0 because they are handed to the embedded
# Python program as plain strings.
INCLUDE_UNFINISHED=1 WRITE_HEADER=1
KEEP_OBSOLETE=0 WRITE_BOM=0 EXPLODE_PLURALS=0
# Parse args
# Options may appear anywhere on the command line; every non-option word is
# collected in ARGS as an input file. "--" ends option parsing so that file
# names beginning with "-" can still be passed.
# Usage errors exit with status 2.
ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    -o|--outdir|-m|--mode)
      # These options take a value. Check for it explicitly: under `set -u`
      # a missing "$2" would otherwise abort with an opaque
      # "unbound variable" message.
      if [[ $# -lt 2 ]]; then
        echo "Error: option $1 requires an argument (see --help)" >&2
        exit 2
      fi
      case "$1" in
        -o|--outdir) OUTDIR="$2";;
        *)           MODE="$2";;
      esac
      shift 2
      ;;
    --include-unfinished) INCLUDE_UNFINISHED=1; shift;;
    --exclude-unfinished) INCLUDE_UNFINISHED=0; shift;;
    --keep-obsolete) KEEP_OBSOLETE=1; shift;;
    --bom) WRITE_BOM=1; shift;;
    --no-header) WRITE_HEADER=0; shift;;
    --explode-plurals) EXPLODE_PLURALS=1; shift;;
    -h|--help)
      # The help text is the "# "-prefixed comment lines of this script.
      grep '^# ' "$0" | sed 's/^# //'
      exit 0
      ;;
    --)
      # Everything after "--" is an input file, even if it starts with "-".
      shift
      ARGS+=("$@")
      break
      ;;
    -?*)
      # A mistyped option would otherwise be silently treated as a file name.
      echo "Error: unknown option: $1 (see --help)" >&2
      exit 2
      ;;
    *) ARGS+=("$1"); shift;;
  esac
done

# Reject unknown modes here; the embedded Python program only tests for
# "1col" and would silently treat anything else as "2col".
case "$MODE" in
  1col|2col) ;;
  *)
    echo "Error: invalid mode '$MODE' (expected \"2col\" or \"1col\")" >&2
    exit 2
    ;;
esac

# The ${ARGS[@]+...} form expands to nothing for an empty array; a plain
# "${ARGS[@]}" is an "unbound variable" error under `set -u` on bash < 4.4.
set -- ${ARGS[@]+"${ARGS[@]}"}
mkdir -p "$OUTDIR"

# Build the list of input files: the positional arguments when any were
# given, otherwise every *.ts file found directly inside the default
# translations directory (not in subdirectories), in sorted order.
if (( $# > 0 )); then
  FILES=("$@")
else
  mapfile -t FILES < <(find "$DEFAULT_INDIR" -maxdepth 1 -type f -name '*.ts' | sort)
fi

# Nothing to convert is an error.
if (( ${#FILES[@]} == 0 )); then
  echo "No .ts files found. Looked in: $DEFAULT_INDIR" >&2
  exit 1
fi
- python3 - <<'PY' "$OUTDIR" "$MODE" "$INCLUDE_UNFINISHED" "$KEEP_OBSOLETE" "$WRITE_BOM" "$WRITE_HEADER" "$EXPLODE_PLURALS" "${FILES[@]}"
- import sys, re, csv, xml.etree.ElementTree as ET
- from pathlib import Path
# Configuration arrives positionally from the bash wrapper:
#   argv[1]     output directory
#   argv[2]     mode, "2col" or "1col"
#   argv[3..7]  on/off switches, each passed as the string "1" or "0"
#   argv[8:]    input .ts files
_argv = sys.argv
OUTDIR = Path(_argv[1])
MODE = _argv[2]
(
    INCL_UNFINISHED,
    KEEP_OBSOLETE,
    WRITE_BOM,
    WRITE_HEADER,
    EXPLODE_PLURALS,
) = (_argv[position] == "1" for position in range(3, 8))
FILES = list(map(Path, _argv[8:]))

# Make sure the output directory exists before any file is written.
OUTDIR.mkdir(parents=True, exist_ok=True)
def guess_lang(ts_path: Path, root: ET.Element) -> str:
    """Return the language code for a TS file.

    The root element's ``language`` attribute wins when it is non-empty
    after stripping. Otherwise the code is guessed from the *last*
    underscore-separated part of the file name, and ``"unknown"`` is
    returned when the name offers nothing usable.

    Filename forms recognised, in order:

    * ``*_ll.ts`` or ``*_ll_RR.ts`` where ``ll`` is a 2-3 letter lowercase
      language and ``RR`` is an uppercase 2-letter region, a title-case
      4-letter script, or a 3-digit area code; ``ll`` is returned
      (``app_zh_CN.ts`` -> ``zh``).
    * any trailing alphabetic segment of two or more letters
      (``app_DE.ts`` -> ``DE``).

    The region must look like a region so that a multi-word base name is
    not mistaken for a locale: ``my_app_de.ts`` gives ``de``, not ``app``.
    The trade-off is that an all-lowercase ``*_en_us.ts`` resolves to its
    last segment (``us``).
    """
    lang = (root.get("language") or "").strip()
    if lang:
        return lang

    name = ts_path.name
    # Strict locale shape first; anchored at the end so only the trailing
    # segments can match.
    m = re.search(
        r'_([a-z]{2,3})(?:_(?:[A-Z]{2}|[A-Z][a-z]{3}|[0-9]{3}))?\.ts$',
        name,
    )
    if m is None:
        # Loose fallback: whatever alphabetic segment directly precedes ".ts".
        m = re.search(r'_([A-Za-z]{2,})\.ts$', name)
    return m.group(1) if m else "unknown"
def should_skip_translation(trans_el: ET.Element) -> bool:
    """Tell whether a <translation> element should be left out of the output.

    Only translations whose ``type`` attribute (compared case-insensitively)
    is "obsolete" or "vanished" can be skipped, and only while the
    KEEP_OBSOLETE switch is off.
    """
    kind = (trans_el.get("type") or "").lower()
    if kind != "obsolete" and kind != "vanished":
        return False
    return not KEEP_OBSOLETE
def is_unfinished(trans_el: ET.Element) -> bool:
    """Return True when the element's ``type`` attribute is "unfinished".

    The comparison ignores case; a missing or empty attribute is not
    unfinished.
    """
    kind = trans_el.get("type")
    if not kind:
        return False
    return kind.lower() == "unfinished"
def text_of(elem: ET.Element) -> str:
    """Return all text inside *elem*, nested elements included.

    The element's own leading text comes first; then, for each child in
    order, the child's full text followed by the child's tail. Whitespace
    and newlines are kept exactly as they appear.
    """
    chunks = [elem.text or ""]
    for kid in elem:
        chunks.extend((text_of(kid), kid.tail or ""))
    return "".join(chunks)
def plural_forms(trans_el: ET.Element):
    """Return the translation's text as a list of stripped strings.

    When the element has direct <numerusform> children, the list holds one
    entry per form, in document order. Otherwise it holds a single entry:
    the element's own text.
    """
    forms = [text_of(node).strip() for node in trans_el.findall("./numerusform")]
    if not forms:
        # Not a plural translation: fall back to the element's own text.
        forms.append(text_of(trans_el).strip())
    return forms
def src_text(msg_el: ET.Element) -> str:
    """Return the stripped text of the message's <source> child.

    A message without a <source> element yields the empty string.
    """
    node = msg_el.find("source")
    if node is None:
        return ""
    return text_of(node).strip()
def is_obsolete_or_vanished(msg_el: ET.Element) -> bool:
    """Return True when the message's translation is obsolete or vanished.

    Looks at the ``type`` attribute of the message's first <translation>
    child, ignoring case. A message with no <translation> child is neither.
    """
    trans = msg_el.find("translation")
    if trans is None:
        return False
    kind = (trans.get("type") or "").lower()
    return kind == "obsolete" or kind == "vanished"
def each_message(root: ET.Element):
    """Yield every <message> that is a direct child of a <context> below *root*."""
    yield from root.findall(".//context/message")
def explode_plural_rows(source: str, forms):
    """Return one (source, translation) row per plural form.

    Each row's source is suffixed with " [plural N]", where N is the form's
    zero-based position in *forms*.
    """
    rows = []
    for position, form in enumerate(forms):
        label = f"{source} [plural {position}]"
        rows.append((label, form))
    return rows
def write_csv(rows, header, out_path: Path, bom=False):
    """Write *rows* to *out_path* as a fully quoted UTF-8 CSV file.

    Args:
        rows: iterable of rows, each an iterable of strings (or None).
        header: column names, written first when truthy; otherwise no
            header row is written.
        out_path: destination file; missing parent directories are created.
        bom: when true, the file is encoded as "utf-8-sig" (UTF-8 with BOM).

    Every cell is quoted and each record ends with CRLF. Line breaks inside
    a cell (CRLF, CR or LF) are replaced by the two characters backslash + n,
    and a None cell is written as an empty string.
    """

    def flatten(cell):
        if cell is None:
            return ""
        # Unify CRLF / CR to LF, then spell every LF as a literal "\n".
        unified = cell.replace("\r\n", "\n").replace("\r", "\n")
        return "\\n".join(unified.split("\n"))

    out_path.parent.mkdir(parents=True, exist_ok=True)
    codec = "utf-8-sig" if bom else "utf-8"
    with out_path.open("w", newline="", encoding=codec) as handle:
        # Quote every field so commas and quotes in the text are safe.
        writer = csv.writer(handle, quoting=csv.QUOTE_ALL, lineterminator="\r\n")
        if header:
            writer.writerow(list(map(flatten, header)))
        writer.writerows([flatten(cell) for cell in row] for row in rows)
# Convert each input file to one CSV in OUTDIR.
#
# A file that cannot be parsed or written is reported on stderr and skipped so
# the remaining files are still converted. Such failures are counted and turn
# into a non-zero exit status at the end, so the calling shell script (which
# runs under `set -e`) does not report success for a partially failed run.
failed = 0
for ts_path in FILES:
    try:
        root = ET.parse(ts_path).getroot()
    except Exception as e:  # per-file boundary: report, count, carry on
        print(f"[ERROR] Failed to parse {ts_path}: {e}", file=sys.stderr)
        failed += 1
        continue

    rows = []
    if MODE == "1col":
        # Single column: each distinct non-empty source string once, in
        # first-seen order.
        seen = set()
        for msg in each_message(root):
            if not KEEP_OBSOLETE and is_obsolete_or_vanished(msg):
                continue
            s = src_text(msg)
            if not s or s in seen:
                continue
            seen.add(s)
            rows.append([s])
        header = ["source"]
        out_name = ts_path.stem + "_single.csv"
    else:
        # MODE == "2col": source string plus its translation(s).
        lang = guess_lang(ts_path, root)
        for msg in each_message(root):
            # Obsolete/vanished messages are dropped here unless explicitly
            # kept, so no further obsolete check is needed below.
            if not KEEP_OBSOLETE and is_obsolete_or_vanished(msg):
                continue
            s = src_text(msg)
            if not s:
                continue
            t_el = msg.find("translation")
            if t_el is None:
                # No <translation> element at all counts as unfinished.
                t_forms = [""]
                unfinished = True
            else:
                t_forms = plural_forms(t_el)
                unfinished = is_unfinished(t_el)
            if unfinished and not INCL_UNFINISHED:
                continue
            if EXPLODE_PLURALS and len(t_forms) > 1:
                rows.extend(explode_plural_rows(s, t_forms))
            else:
                rows.append([s, " | ".join(t_forms)])
        header = ["source", f"translation_{lang}"]
        out_name = ts_path.stem + ".csv"

    out_path = OUTDIR / out_name
    try:
        write_csv(rows, header if WRITE_HEADER else None, out_path, bom=WRITE_BOM)
    except OSError as e:
        print(f"[ERROR] Failed to write {out_path}: {e}", file=sys.stderr)
        failed += 1
        continue
    print(f"[OK] {ts_path.name} -> {out_path}")

if failed:
    sys.exit(1)
- PY
# Final confirmation: tell the user where the generated CSV files are.
printf '%s\n' "Done. CSV files are in: $OUTDIR"
|