#!/usr/bin/env bash
set -euo pipefail
# ts2csv.sh — Convert Qt .ts translation files to CSV
#
# Usage examples:
#   ./scripts/ts2csv.sh                    # convert all in ./translations -> ./translations/csv (2-column)
#   ./scripts/ts2csv.sh -m 1col            # single-column CSV of source strings
#   ./scripts/ts2csv.sh -o ./out ./translations/app_de.ts
#   ./scripts/ts2csv.sh --bom --no-header
#
# Flags:
#   -o, --outdir DIR       Output directory (default: <repo>/translations/csv)
#   -m, --mode MODE        "2col" (default) or "1col"
#   --include-unfinished   Include messages with translation type="unfinished" (default: on)
#   --exclude-unfinished   Exclude unfinished messages
#   --keep-obsolete        Keep messages marked obsolete/vanished (default: skip)
#   --bom                  Write UTF-8 with BOM (Excel-friendly)
#   --no-header            Do not write header row
#   --explode-plurals      Instead of joining plural forms, create separate rows
#   -h, --help             Show this help and exit
#
# Notes:
# - Without file arguments, every *.ts file directly inside <repo>/translations is converted.
# - Output files are named <input stem>.csv (2col) or <input stem>_single.csv (1col).
# - Language is taken from TS/@language if present; otherwise guessed from filename (e.g. *_de.ts -> de).
# - Plural forms (numerus) are joined with " | " unless --explode-plurals is used.
# - Embedded newlines in fields are flattened to the literal two characters "\n".
# - All CSV fields are quoted (RFC 4180), so commas and quotes inside fields are safe.

# Locations are resolved relative to this script so it can be run from any cwd;
# the script is expected to live one level below the repository root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DEFAULT_INDIR="$REPO_ROOT/translations"

# Defaults; each may be overridden by the command-line flags documented above.
# Boolean switches are 1/0 because they are forwarded verbatim to the embedded
# Python converter, which compares them against the string "1".
OUTDIR="$REPO_ROOT/translations/csv"
MODE="2col"
INCLUDE_UNFINISHED=1
KEEP_OBSOLETE=0
WRITE_BOM=0
WRITE_HEADER=1
EXPLODE_PLURALS=0
  35. # Parse args
  36. ARGS=()
  37. while [[ $# -gt 0 ]]; do
  38. case "$1" in
  39. -o|--outdir) OUTDIR="$2"; shift 2;;
  40. -m|--mode) MODE="$2"; shift 2;;
  41. --include-unfinished) INCLUDE_UNFINISHED=1; shift;;
  42. --exclude-unfinished) INCLUDE_UNFINISHED=0; shift;;
  43. --keep-obsolete) KEEP_OBSOLETE=1; shift;;
  44. --bom) WRITE_BOM=1; shift;;
  45. --no-header) WRITE_HEADER=0; shift;;
  46. --explode-plurals) EXPLODE_PLURALS=1; shift;;
  47. -h|--help)
  48. grep '^# ' "$0" | sed 's/^# //'
  49. exit 0
  50. ;;
  51. *) ARGS+=("$1"); shift;;
  52. esac
  53. done
  54. set -- "${ARGS[@]}"
  55. mkdir -p "$OUTDIR"
  56. # If no files passed, use all .ts in default translations dir
  57. if [[ $# -eq 0 ]]; then
  58. mapfile -t FILES < <(find "$DEFAULT_INDIR" -maxdepth 1 -type f -name '*.ts' | sort)
  59. else
  60. FILES=("$@")
  61. fi
  62. if [[ ${#FILES[@]} -eq 0 ]]; then
  63. echo "No .ts files found. Looked in: $DEFAULT_INDIR" >&2
  64. exit 1
  65. fi
  66. python3 - <<'PY' "$OUTDIR" "$MODE" "$INCLUDE_UNFINISHED" "$KEEP_OBSOLETE" "$WRITE_BOM" "$WRITE_HEADER" "$EXPLODE_PLURALS" "${FILES[@]}"
import csv
import re
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
  69. # Pull configuration from argv (provided by the bash wrapper for simplicity)
  70. OUTDIR = Path(sys.argv[1])
  71. MODE = sys.argv[2] # "2col" | "1col"
  72. INCL_UNFINISHED = sys.argv[3] == "1"
  73. KEEP_OBSOLETE = sys.argv[4] == "1"
  74. WRITE_BOM = sys.argv[5] == "1"
  75. WRITE_HEADER = sys.argv[6] == "1"
  76. EXPLODE_PLURALS = sys.argv[7] == "1"
  77. FILES = [Path(p) for p in sys.argv[8:]]
  78. OUTDIR.mkdir(parents=True, exist_ok=True)
  79. def guess_lang(ts_path: Path, root: ET.Element) -> str:
  80. lang = (root.get("language") or "").strip()
  81. if lang:
  82. return lang
  83. # fallback: extract last _xx before .ts
  84. m = re.search(r'_([A-Za-z]{2,})(?:_[A-Za-z0-9]+)?\.ts$', ts_path.name)
  85. return (m.group(1) if m else "unknown")
  86. def should_skip_translation(trans_el: ET.Element) -> bool:
  87. t = (trans_el.get("type") or "").lower()
  88. if t in ("obsolete", "vanished"):
  89. return not KEEP_OBSOLETE
  90. return False
  91. def is_unfinished(trans_el: ET.Element) -> bool:
  92. return (trans_el.get("type") or "").lower() == "unfinished"
  93. def text_of(elem: ET.Element) -> str:
  94. # Concatenate text including nested nodes, preserving newlines
  95. parts = []
  96. if elem.text:
  97. parts.append(elem.text)
  98. for child in elem:
  99. parts.append(text_of(child))
  100. if child.tail:
  101. parts.append(child.tail)
  102. return "".join(parts)
  103. def plural_forms(trans_el: ET.Element):
  104. # Return list of plural form strings if numerus; else single string list
  105. nufs = list(trans_el.findall("./numerusform"))
  106. if nufs:
  107. return [text_of(n).strip() for n in nufs]
  108. return [(text_of(trans_el) or "").strip()]
  109. def src_text(msg_el: ET.Element) -> str:
  110. s = msg_el.find("source")
  111. return (text_of(s) if s is not None else "").strip()
  112. def is_obsolete_or_vanished(msg_el: ET.Element) -> bool:
  113. t = msg_el.find("translation")
  114. if t is None:
  115. return False
  116. typ = (t.get("type") or "").lower()
  117. return typ in ("obsolete", "vanished")
  118. def each_message(root: ET.Element):
  119. for msg in root.findall(".//context/message"):
  120. yield msg
  121. def explode_plural_rows(source: str, forms):
  122. return [(f"{source} [plural {idx}]", val) for idx, val in enumerate(forms)]
  123. def write_csv(rows, header, out_path: Path, bom=False):
  124. out_path.parent.mkdir(parents=True, exist_ok=True)
  125. encoding = "utf-8-sig" if bom else "utf-8"
  126. def sanitize(s: str) -> str:
  127. if s is None:
  128. return ""
  129. # normalize then flatten newlines to literal "\n"
  130. s = s.replace("\r\n", "\n").replace("\r", "\n")
  131. s = s.replace("\n", "\\n")
  132. return s
  133. with out_path.open("w", newline="", encoding=encoding) as f:
  134. # Quote EVERYTHING so commas/newlines/quotes are safe; CRLF for Excel
  135. w = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\r\n")
  136. if header:
  137. w.writerow([sanitize(h) for h in header])
  138. for r in rows:
  139. w.writerow([sanitize(c) for c in r])
  140. for ts_path in FILES:
  141. try:
  142. tree = ET.parse(ts_path)
  143. root = tree.getroot()
  144. except Exception as e:
  145. print(f"[ERROR] Failed to parse {ts_path}: {e}", file=sys.stderr)
  146. continue
  147. lang = guess_lang(ts_path, root)
  148. rows = []
  149. if MODE == "1col":
  150. seen = set()
  151. for msg in each_message(root):
  152. if not KEEP_OBSOLETE and is_obsolete_or_vanished(msg):
  153. continue
  154. s = src_text(msg)
  155. if not s:
  156. continue
  157. if s not in seen:
  158. seen.add(s)
  159. rows.append([s])
  160. header = ["source"] if WRITE_HEADER else None
  161. out_name = ts_path.stem + "_single.csv"
  162. out_path = OUTDIR / out_name
  163. write_csv(rows, header, out_path, bom=WRITE_BOM)
  164. print(f"[OK] {ts_path.name} -> {out_path}")
  165. continue
  166. # MODE == "2col"
  167. for msg in each_message(root):
  168. if not KEEP_OBSOLETE and is_obsolete_or_vanished(msg):
  169. continue
  170. s = src_text(msg)
  171. if not s:
  172. continue
  173. t_el = msg.find("translation")
  174. if t_el is None:
  175. t_forms = [""]
  176. unfinished = True
  177. else:
  178. if should_skip_translation(t_el):
  179. continue
  180. t_forms = plural_forms(t_el)
  181. unfinished = is_unfinished(t_el)
  182. if not INCL_UNFINISHED and unfinished:
  183. continue
  184. if EXPLODE_PLURALS and len(t_forms) > 1:
  185. rows.extend(explode_plural_rows(s, t_forms))
  186. else:
  187. t_joined = " | ".join(t_forms)
  188. rows.append([s, t_joined])
  189. header = (["source", f"translation_{lang}"] if WRITE_HEADER else None)
  190. out_name = ts_path.stem + ".csv"
  191. out_path = OUTDIR / out_name
  192. write_csv(rows, header, out_path, bom=WRITE_BOM)
  193. print(f"[OK] {ts_path.name} -> {out_path}")
  194. PY
  195. echo "Done. CSV files are in: $OUTDIR"