Browse Source

Refactor ts2csv.sh for improved readability

Adam Djellouli 1 month ago
parent
commit
a647b7c09c
1 changed files with 31 additions and 20 deletions
  1. 31 20
      scripts/ts2csv.sh

+ 31 - 20
scripts/ts2csv.sh

@@ -21,6 +21,8 @@ set -euo pipefail
 # Notes:
 # - Language is taken from TS/@language if present; otherwise guessed from filename (e.g. *_de.ts -> de).
 # - Plural forms (numerus) are joined with " | " unless --explode-plurals is used.
+# - Embedded newlines in fields are flattened to the literal two characters "\n" by default.
+# - All CSV fields are quoted (RFC 4180), so commas and quotes inside fields are safe.
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
@@ -70,18 +72,18 @@ if [[ ${#FILES[@]} -eq 0 ]]; then
 fi
 
 python3 - <<'PY' "$OUTDIR" "$MODE" "$INCLUDE_UNFINISHED" "$KEEP_OBSOLETE" "$WRITE_BOM" "$WRITE_HEADER" "$EXPLODE_PLURALS" "${FILES[@]}"
-import sys, os, re, csv, argparse, xml.etree.ElementTree as ET
+import sys, re, csv, xml.etree.ElementTree as ET
 from pathlib import Path
 
 # Pull configuration from argv (provided by the bash wrapper for simplicity)
-OUTDIR          = Path(sys.argv[1])
-MODE            = sys.argv[2]           # "2col" | "1col"
-INCL_UNFINISHED = sys.argv[3] == "1"
-KEEP_OBSOLETE   = sys.argv[4] == "1"
-WRITE_BOM       = sys.argv[5] == "1"
-WRITE_HEADER    = sys.argv[6] == "1"
-EXPLODE_PLURALS = sys.argv[7] == "1"
-FILES           = [Path(p) for p in sys.argv[8:]]
+OUTDIR           = Path(sys.argv[1])
+MODE             = sys.argv[2]           # "2col" | "1col"
+INCL_UNFINISHED  = sys.argv[3] == "1"
+KEEP_OBSOLETE    = sys.argv[4] == "1"
+WRITE_BOM        = sys.argv[5] == "1"
+WRITE_HEADER     = sys.argv[6] == "1"
+EXPLODE_PLURALS  = sys.argv[7] == "1"
+FILES            = [Path(p) for p in sys.argv[8:]]
 
 OUTDIR.mkdir(parents=True, exist_ok=True)
 
@@ -118,7 +120,7 @@ def plural_forms(trans_el: ET.Element):
     nufs = list(trans_el.findall("./numerusform"))
     if nufs:
         return [text_of(n).strip() for n in nufs]
-    return [ (text_of(trans_el) or "").strip() ]
+    return [(text_of(trans_el) or "").strip()]
 
 def src_text(msg_el: ET.Element) -> str:
     s = msg_el.find("source")
@@ -126,30 +128,37 @@ def src_text(msg_el: ET.Element) -> str:
 
 def is_obsolete_or_vanished(msg_el: ET.Element) -> bool:
     t = msg_el.find("translation")
-    if t is None: 
+    if t is None:
         return False
     typ = (t.get("type") or "").lower()
-    return typ in ("obsolete","vanished")
+    return typ in ("obsolete", "vanished")
 
 def each_message(root: ET.Element):
     for msg in root.findall(".//context/message"):
         yield msg
 
-def explode_plural_rows(source: str, forms: list[str]):
-    rows = []
-    for idx, val in enumerate(forms):
-        rows.append((f"{source} [plural {idx}]", val))
-    return rows
+def explode_plural_rows(source: str, forms):
+    return [(f"{source} [plural {idx}]", val) for idx, val in enumerate(forms)]
 
 def write_csv(rows, header, out_path: Path, bom=False):
     out_path.parent.mkdir(parents=True, exist_ok=True)
     encoding = "utf-8-sig" if bom else "utf-8"
+
+    def sanitize(s: str) -> str:
+        if s is None:
+            return ""
+        # normalize then flatten newlines to literal "\n"
+        s = s.replace("\r\n", "\n").replace("\r", "\n")
+        s = s.replace("\n", "\\n")
+        return s
+
     with out_path.open("w", newline="", encoding=encoding) as f:
-        w = csv.writer(f)  # default excels at quoting; keeps embedded newlines
+        # Quote EVERYTHING so commas/newlines/quotes are safe; CRLF for Excel
+        w = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\r\n")
         if header:
-            w.writerow(header)
+            w.writerow([sanitize(h) for h in header])
         for r in rows:
-            w.writerow(r)
+            w.writerow([sanitize(c) for c in r])
 
 for ts_path in FILES:
     try:
@@ -192,6 +201,8 @@ for ts_path in FILES:
             t_forms = [""]
             unfinished = True
         else:
+            if should_skip_translation(t_el):
+                continue
             t_forms = plural_forms(t_el)
             unfinished = is_unfinished(t_el)