ts2csv.sh 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. #!/usr/bin/env bash
  2. set -euo pipefail
  3. # ts2csv.sh — Convert Qt .ts translation files to CSV
  4. # Usage examples:
  5. # ./scripts/ts2csv.sh # convert all in ./translations -> ./translations/csv (2-column)
  6. # ./scripts/ts2csv.sh -m 1col # single-column CSV of source strings
  7. # ./scripts/ts2csv.sh -o ./out csv/translations/app_de.ts
  8. # ./scripts/ts2csv.sh --bom --no-header
  9. #
  10. # Flags:
  11. # -o, --outdir DIR Output directory (default: <repo>/translations/csv)
  12. # -m, --mode MODE "2col" (default) or "1col"
  13. # --include-unfinished Include messages with translation type="unfinished" (default: on)
  14. # --exclude-unfinished Exclude unfinished (default is include)
  15. # --keep-obsolete Keep messages marked obsolete/vanished (default: skip)
  16. # --bom Write UTF-8 with BOM (Excel-friendly)
  17. # --no-header Do not write header row
  18. # --explode-plurals Instead of joining plural forms, create separate rows
  19. #
  20. # Notes:
  21. # - Language is taken from TS/@language if present; otherwise guessed from filename (e.g. *_de.ts -> de).
  22. # - Plural forms (numerus) are joined with " | " unless --explode-plurals is used.
  23. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  24. REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
  25. DEFAULT_INDIR="$REPO_ROOT/translations"
  26. OUTDIR="$REPO_ROOT/translations/csv"
  27. MODE="2col"
  28. INCLUDE_UNFINISHED=1
  29. KEEP_OBSOLETE=0
  30. WRITE_BOM=0
  31. WRITE_HEADER=1
  32. EXPLODE_PLURALS=0
  33. # Parse args
  34. ARGS=()
  35. while [[ $# -gt 0 ]]; do
  36. case "$1" in
  37. -o|--outdir) OUTDIR="$2"; shift 2;;
  38. -m|--mode) MODE="$2"; shift 2;;
  39. --include-unfinished) INCLUDE_UNFINISHED=1; shift;;
  40. --exclude-unfinished) INCLUDE_UNFINISHED=0; shift;;
  41. --keep-obsolete) KEEP_OBSOLETE=1; shift;;
  42. --bom) WRITE_BOM=1; shift;;
  43. --no-header) WRITE_HEADER=0; shift;;
  44. --explode-plurals) EXPLODE_PLURALS=1; shift;;
  45. -h|--help)
  46. grep '^# ' "$0" | sed 's/^# //'
  47. exit 0
  48. ;;
  49. *) ARGS+=("$1"); shift;;
  50. esac
  51. done
  52. set -- "${ARGS[@]}"
  53. mkdir -p "$OUTDIR"
  54. # If no files passed, use all .ts in default translations dir
  55. if [[ $# -eq 0 ]]; then
  56. mapfile -t FILES < <(find "$DEFAULT_INDIR" -maxdepth 1 -type f -name '*.ts' | sort)
  57. else
  58. FILES=("$@")
  59. fi
  60. if [[ ${#FILES[@]} -eq 0 ]]; then
  61. echo "No .ts files found. Looked in: $DEFAULT_INDIR" >&2
  62. exit 1
  63. fi
  64. python3 - <<'PY' "$OUTDIR" "$MODE" "$INCLUDE_UNFINISHED" "$KEEP_OBSOLETE" "$WRITE_BOM" "$WRITE_HEADER" "$EXPLODE_PLURALS" "${FILES[@]}"
  65. import sys, os, re, csv, argparse, xml.etree.ElementTree as ET
  66. from pathlib import Path
  67. # Pull configuration from argv (provided by the bash wrapper for simplicity)
  68. OUTDIR = Path(sys.argv[1])
  69. MODE = sys.argv[2] # "2col" | "1col"
  70. INCL_UNFINISHED = sys.argv[3] == "1"
  71. KEEP_OBSOLETE = sys.argv[4] == "1"
  72. WRITE_BOM = sys.argv[5] == "1"
  73. WRITE_HEADER = sys.argv[6] == "1"
  74. EXPLODE_PLURALS = sys.argv[7] == "1"
  75. FILES = [Path(p) for p in sys.argv[8:]]
  76. OUTDIR.mkdir(parents=True, exist_ok=True)
  77. def guess_lang(ts_path: Path, root: ET.Element) -> str:
  78. lang = (root.get("language") or "").strip()
  79. if lang:
  80. return lang
  81. # fallback: extract last _xx before .ts
  82. m = re.search(r'_([A-Za-z]{2,})(?:_[A-Za-z0-9]+)?\.ts$', ts_path.name)
  83. return (m.group(1) if m else "unknown")
  84. def should_skip_translation(trans_el: ET.Element) -> bool:
  85. t = (trans_el.get("type") or "").lower()
  86. if t in ("obsolete", "vanished"):
  87. return not KEEP_OBSOLETE
  88. return False
  89. def is_unfinished(trans_el: ET.Element) -> bool:
  90. return (trans_el.get("type") or "").lower() == "unfinished"
  91. def text_of(elem: ET.Element) -> str:
  92. # Concatenate text including nested nodes, preserving newlines
  93. parts = []
  94. if elem.text:
  95. parts.append(elem.text)
  96. for child in elem:
  97. parts.append(text_of(child))
  98. if child.tail:
  99. parts.append(child.tail)
  100. return "".join(parts)
  101. def plural_forms(trans_el: ET.Element):
  102. # Return list of plural form strings if numerus; else single string list
  103. nufs = list(trans_el.findall("./numerusform"))
  104. if nufs:
  105. return [text_of(n).strip() for n in nufs]
  106. return [ (text_of(trans_el) or "").strip() ]
  107. def src_text(msg_el: ET.Element) -> str:
  108. s = msg_el.find("source")
  109. return (text_of(s) if s is not None else "").strip()
  110. def is_obsolete_or_vanished(msg_el: ET.Element) -> bool:
  111. t = msg_el.find("translation")
  112. if t is None:
  113. return False
  114. typ = (t.get("type") or "").lower()
  115. return typ in ("obsolete","vanished")
  116. def each_message(root: ET.Element):
  117. for msg in root.findall(".//context/message"):
  118. yield msg
  119. def explode_plural_rows(source: str, forms: list[str]):
  120. rows = []
  121. for idx, val in enumerate(forms):
  122. rows.append((f"{source} [plural {idx}]", val))
  123. return rows
  124. def write_csv(rows, header, out_path: Path, bom=False):
  125. out_path.parent.mkdir(parents=True, exist_ok=True)
  126. encoding = "utf-8-sig" if bom else "utf-8"
  127. with out_path.open("w", newline="", encoding=encoding) as f:
  128. w = csv.writer(f) # default excels at quoting; keeps embedded newlines
  129. if header:
  130. w.writerow(header)
  131. for r in rows:
  132. w.writerow(r)
  133. for ts_path in FILES:
  134. try:
  135. tree = ET.parse(ts_path)
  136. root = tree.getroot()
  137. except Exception as e:
  138. print(f"[ERROR] Failed to parse {ts_path}: {e}", file=sys.stderr)
  139. continue
  140. lang = guess_lang(ts_path, root)
  141. rows = []
  142. if MODE == "1col":
  143. seen = set()
  144. for msg in each_message(root):
  145. if not KEEP_OBSOLETE and is_obsolete_or_vanished(msg):
  146. continue
  147. s = src_text(msg)
  148. if not s:
  149. continue
  150. if s not in seen:
  151. seen.add(s)
  152. rows.append([s])
  153. header = ["source"] if WRITE_HEADER else None
  154. out_name = ts_path.stem + "_single.csv"
  155. out_path = OUTDIR / out_name
  156. write_csv(rows, header, out_path, bom=WRITE_BOM)
  157. print(f"[OK] {ts_path.name} -> {out_path}")
  158. continue
  159. # MODE == "2col"
  160. for msg in each_message(root):
  161. if not KEEP_OBSOLETE and is_obsolete_or_vanished(msg):
  162. continue
  163. s = src_text(msg)
  164. if not s:
  165. continue
  166. t_el = msg.find("translation")
  167. if t_el is None:
  168. t_forms = [""]
  169. unfinished = True
  170. else:
  171. t_forms = plural_forms(t_el)
  172. unfinished = is_unfinished(t_el)
  173. if not INCL_UNFINISHED and unfinished:
  174. continue
  175. if EXPLODE_PLURALS and len(t_forms) > 1:
  176. rows.extend(explode_plural_rows(s, t_forms))
  177. else:
  178. t_joined = " | ".join(t_forms)
  179. rows.append([s, t_joined])
  180. header = (["source", f"translation_{lang}"] if WRITE_HEADER else None)
  181. out_name = ts_path.stem + ".csv"
  182. out_path = OUTDIR / out_name
  183. write_csv(rows, header, out_path, bom=WRITE_BOM)
  184. print(f"[OK] {ts_path.name} -> {out_path}")
  185. PY
  186. echo "Done. CSV files are in: $OUTDIR"