Browse Source

Enhance CSV processing and plural handling

Refactor CSV processing to handle flattened newlines and improve plural handling.
Adam Djellouli 1 month ago
parent
commit
0437e01196
1 changed files with 103 additions and 128 deletions
  1. 103 128
      scripts/csv2ts.sh

+ 103 - 128
scripts/csv2ts.sh

@@ -16,22 +16,13 @@ set -euo pipefail
 # CSV expectations (auto-detected):
 #   - 2-column with header: "source,<anything>"  (e.g., "translation_de" or "translation")
 #   - 2-column without header: column 1=source, column 2=translation
-#   - Plurals:
-#       - Joined in a single cell: plural forms separated by " | "   (default)
-#       - OR "exploded": create rows with source suffixed by " [plural N]" (0-based)
-#
-# Flags:
-#   -i, --input CSV            Input CSV file
-#   -t, --template TS          Template .ts file to update (existing locale or English)
-#   -o, --outdir DIR           Where to write updated .ts (default: <repo>/translations/updated)
-#   -l, --lang LANG            Override TS @language (e.g. de_DE, fr_FR)
-#       --plural-sep SEP       Separator for joined plural forms (default: " | ")
-#       --inplace              Write changes back to the template .ts path
-#       --backup               When --inplace, create a .bak next to the template
-#       --clear-when-empty     If translation cell empty, set type="unfinished" and clear text (default: on)
-#       --keep-text-when-empty Keep previous text if CSV cell empty (still marks unfinished)
+#   - Plurals in CSV may be:
+#       - Joined in a single cell, forms separated by " | "
+#       - OR "exploded": rows with source suffixed by " [plural N]" (0-based)
 #
 # Notes:
+# - Matches ts2csv defaults: embedded newlines were flattened to "\n" during export;
+#   this script unflattens those back to real newlines by default.
 # - Messages not present in CSV remain unchanged.
 # - If the template message is plural (numerus="yes"), CSV can be joined or exploded; both are supported.
 # - If CSV provides fewer plural forms than exist in template, only provided indices are updated; others kept.
@@ -89,7 +80,7 @@ if [[ $INPLACE -eq 1 && $BACKUP -eq 1 ]]; then
 fi
 
 python3 - <<'PY' "$INPUT_CSV" "$TEMPLATE_TS" "$OUTDIR" "$LANG_OVERRIDE" "$PLURAL_SEP" "$INPLACE" "$CLEAR_WHEN_EMPTY" "$KEEP_TEXT_WHEN_EMPTY"
-import sys, os, re, csv, xml.etree.ElementTree as ET
+import sys, re, csv, xml.etree.ElementTree as ET
 from pathlib import Path
 
 INPUT_CSV = Path(sys.argv[1])
@@ -101,43 +92,44 @@ INPLACE = sys.argv[6] == "1"
 CLEAR_WHEN_EMPTY = sys.argv[7] == "1"
 KEEP_TEXT_WHEN_EMPTY = sys.argv[8] == "1"
 
-def read_csv_map(csv_path: Path):
+def decode_flattened_newlines(s: str) -> str:
+    if s is None:
+        return ""
+    # normalize AND unflatten literal "\n" back to real newlines
+    s = s.replace("\r\n", "\n").replace("\r", "\n")
+    return s.replace("\\n", "\n")
+
+def read_csv_maps(csv_path: Path):
     """
-    Returns:
-      mapping: dict[source] -> list[str] (plural forms) OR single-element list for non-plural
-      exploded_indices: dict[source] -> {idx: text} (if exploded plural rows were found)
+    Returns two maps:
+      raw_map[src] = raw string value from CSV (unflattened newlines)
+      exploded[src] = {index: form_text} for any " [plural N]" rows
+    Joined plurals (value contains PLURAL_SEP) are kept as a single string in raw_map.
+    We only split by PLURAL_SEP later if the target message is actually plural.
     """
-    mapping = {}
-    exploded_indices = {}
-    def add_mapping(src, val):
-        if src not in mapping:
-            mapping[src] = []
-        mapping[src] = [val]  # non-plural default; may be replaced if joined contains multiple
-    def set_plural(src, idx, val):
-        d = exploded_indices.setdefault(src, {})
-        d[int(idx)] = val
+    raw_map: dict[str, str] = {}
+    exploded: dict[str, dict[int, str]] = {}
 
     with csv_path.open("r", newline="", encoding="utf-8-sig") as f:
-        rdr = csv.reader(f)
+        rdr = csv.reader(f)  # handles commas/quotes/multiline properly
         rows = list(rdr)
 
     if not rows:
-        return mapping, exploded_indices
+        return raw_map, exploded
 
     # Detect header
     header = None
-    if any(h.lower() == "source" for h in rows[0]):
+    if any(isinstance(h, str) and h.lower() == "source" for h in rows[0]):
         header = [h.strip() for h in rows[0]]
         rows = rows[1:]
 
     # Column indices
     if header:
         try:
-            source_idx = [i for i,h in enumerate(header) if h.lower()=="source"][0]
+            source_idx = [i for i, h in enumerate(header) if h.lower() == "source"][0]
         except IndexError:
             raise SystemExit("CSV header must contain a 'source' column")
-        # translation column = first non-source column
-        trans_idx_candidates = [i for i,h in enumerate(header) if i != source_idx]
+        trans_idx_candidates = [i for i, h in enumerate(header) if i != source_idx]
         if not trans_idx_candidates:
             raise SystemExit("CSV must have a translation column")
         trans_idx = trans_idx_candidates[0]
@@ -151,40 +143,24 @@ def read_csv_map(csv_path: Path):
     for r in rows:
         if not r:
             continue
-        # pad short rows
         if len(r) <= max(source_idx, trans_idx):
-            r = (r + [""]*(max(source_idx, trans_idx)+1 - len(r)))
+            r = (r + [""] * (max(source_idx, trans_idx) + 1 - len(r)))
         src = (r[source_idx] or "").strip()
         if not src:
             continue
-        val = (r[trans_idx] or "")
+        val = decode_flattened_newlines(r[trans_idx] or "")
 
         m = plural_tag_re.search(src)
         if m:
             base_src = plural_tag_re.sub("", src).rstrip()
-            set_plural(base_src, m.group(1), val)
+            d = exploded.setdefault(base_src, {})
+            d[int(m.group(1))] = val
             continue
 
-        # joined plural forms? split by PLURAL_SEP; keep raw if no sep
-        if PLURAL_SEP in val:
-            forms = [s for s in (p.strip() for p in val.split(PLURAL_SEP))]
-            mapping[src] = forms
-        else:
-            add_mapping(src, val)
-
-    # Merge exploded forms into mapping
-    for base_src, idx_map in exploded_indices.items():
-        max_index = max(idx_map.keys()) if idx_map else -1
-        forms = ["" for _ in range(max_index+1)]
-        for i, v in idx_map.items():
-            if i < 0:
-                continue
-            if i >= len(forms):
-                forms.extend([""] * (i+1-len(forms)))
-            forms[i] = v
-        mapping[base_src] = forms
-
-    return mapping, exploded_indices
+        # store raw (possibly joined) value; don't split here
+        raw_map[src] = val
+
+    return raw_map, exploded
 
 def text_of(elem: ET.Element) -> str:
     parts = []
@@ -202,90 +178,89 @@ def set_text(elem: ET.Element, value: str):
         elem.remove(child)
     elem.text = value
 
-def update_ts(template_ts: Path, mapping: dict, lang_override: str | None):
+def update_ts(template_ts: Path, raw_map: dict, exploded: dict, lang_override: str | None):
     tree = ET.parse(template_ts)
     root = tree.getroot()
 
     if lang_override:
         root.set("language", lang_override)
 
-    # namespace cleanup if present
-    for ctx in root.findall(".//context"):
-        for msg in ctx.findall("message"):
-            source_el = msg.find("source")
-            if source_el is None:
-                continue
-            src = (text_of(source_el) or "").strip()
-            if not src:
-                continue
-
-            numerus = (msg.get("numerus") or "").lower() == "yes"
-            trans_el = msg.find("translation")
-            if trans_el is None:
-                trans_el = ET.SubElement(msg, "translation")
-
-            if src not in mapping:
-                # Not provided in CSV — leave untouched
-                continue
-
-            values = mapping[src]  # list[str] even for non-plural
-            if numerus:
-                # ensure <numerusform> children
-                existing = trans_el.findall("numerusform")
-                if existing:
-                    # Update existing indices; if CSV gives fewer, update only provided
-                    for i, child in enumerate(existing):
-                        if i < len(values):
-                            set_text(child, values[i])
-                    # If CSV provides more, append more nodes
-                    for i in range(len(existing), len(values)):
-                        n = ET.SubElement(trans_el, "numerusform")
-                        set_text(n, values[i])
-                else:
-                    # No children yet; create from CSV
-                    for v in values:
-                        n = ET.SubElement(trans_el, "numerusform")
-                        set_text(n, v)
-
-                # Mark finished/unfinished
-                empty_all = all((v.strip() == "") for v in values)
-                if empty_all:
-                    if CLEAR_WHEN_EMPTY:
-                        # clear all forms
-                        for child in trans_el.findall("numerusform"):
-                            set_text(child, "")
-                    trans_el.set("type", "unfinished")
-                else:
-                    # remove 'type' attribute if present
-                    if "type" in trans_el.attrib:
-                        del trans_el.attrib["type"]
+    for msg in root.findall(".//context/message"):
+        source_el = msg.find("source")
+        if source_el is None:
+            continue
+        src = (text_of(source_el) or "").strip()
+        if not src:
+            continue
 
+        numerus = (msg.get("numerus") or "").lower() == "yes"
+        trans_el = msg.find("translation")
+        if trans_el is None:
+            trans_el = ET.SubElement(msg, "translation")
+
+        if src not in raw_map and src not in exploded:
+            # Not provided in CSV — leave untouched
+            continue
+
+        if numerus:
+            # prefer exploded rows if present
+            if src in exploded:
+                idx_map = exploded[src]
+                max_index = max(idx_map.keys()) if idx_map else -1
+                forms = ["" for _ in range(max_index + 1)]
+                for i, v in idx_map.items():
+                    if i >= 0:
+                        if i >= len(forms):
+                            forms.extend([""] * (i + 1 - len(forms)))
+                        forms[i] = v
+            else:
+                raw_val = raw_map.get(src, "")
+                # only split joined forms if the message is numerus
+                forms = [s.strip() for s in raw_val.split(PLURAL_SEP)] if PLURAL_SEP in raw_val else [raw_val]
+
+            existing = trans_el.findall("numerusform")
+            if existing:
+                for i, child in enumerate(existing):
+                    if i < len(forms):
+                        set_text(child, forms[i])
+                for i in range(len(existing), len(forms)):
+                    n = ET.SubElement(trans_el, "numerusform")
+                    set_text(n, forms[i])
+            else:
+                for v in forms:
+                    n = ET.SubElement(trans_el, "numerusform")
+                    set_text(n, v)
+
+            empty_all = all((v.strip() == "") for v in forms)
+            if empty_all:
+                if CLEAR_WHEN_EMPTY:
+                    for child in trans_el.findall("numerusform"):
+                        set_text(child, "")
+                trans_el.set("type", "unfinished")
+            else:
+                trans_el.attrib.pop("type", None)
+
+        else:
+            # non-plural: take the raw value as-is, even if it contains the plural separator
+            val = raw_map.get(src, "")
+            if val.strip() == "":
+                if CLEAR_WHEN_EMPTY:
+                    set_text(trans_el, "")
+                trans_el.set("type", "unfinished")
             else:
-                val = values[0] if values else ""
-                if val.strip() == "":
-                    if CLEAR_WHEN_EMPTY:
-                        set_text(trans_el, "")
-                    # mark unfinished (or keep text if KEEP_TEXT_WHEN_EMPTY)
-                    trans_el.set("type", "unfinished")
-                else:
-                    set_text(trans_el, val)
-                    if "type" in trans_el.attrib:
-                        del trans_el.attrib["type"]
-
-    return tree, root
+                set_text(trans_el, val)
+                trans_el.attrib.pop("type", None)
+
+    return tree
 
 def write_tree(tree: ET.ElementTree, out_path: Path):
     out_path.parent.mkdir(parents=True, exist_ok=True)
     tree.write(out_path, encoding="utf-8", xml_declaration=True)
 
-mapping, _ = read_csv_map(INPUT_CSV)
-tree, root = update_ts(TEMPLATE_TS, mapping, LANG_OVERRIDE or None)
-
-if INPLACE:
-    out_path = TEMPLATE_TS
-else:
-    out_path = OUTDIR / TEMPLATE_TS.name
+raw_map, exploded = read_csv_maps(INPUT_CSV)
+tree = update_ts(TEMPLATE_TS, raw_map, exploded, LANG_OVERRIDE or None)
 
+out_path = TEMPLATE_TS if INPLACE else (OUTDIR / TEMPLATE_TS.name)
 write_tree(tree, out_path)
 print(f"[OK] Applied {INPUT_CSV.name} -> {out_path}")