#!/usr/bin/env bash
# scripts/remove-comments.sh
# Remove comments from C/C++, Python, and shader source files in-place.
#
# Run with --help for the list of options. The variables below hold the
# defaults; command-line options override them.

set -Eeuo pipefail
# Report the failing line and command when an unhandled command failure occurs.
trap 'echo "error: line $LINENO: $BASH_COMMAND" >&2' ERR

# Extensions scanned when -x/--ext is not given (comma-separated, no dots).
EXTS_DEFAULT="c,cc,cpp,cxx,h,hh,hpp,hxx,ipp,inl,tpp,qml,vert,frag,glsl,py"

ROOTS=(".")          # paths to scan; replaced by positional arguments if any
DRY_RUN=0            # 1 = only report, never write
BACKUP=0             # 1 = create FILE.bak before writing (OFF by default)
QUIET=0              # 1 = suppress progress output
EXTS="$EXTS_DEFAULT" # active extension list
  12. usage() {
  13. cat <<'USAGE'
  14. remove-comments.sh - strip comments from C/C++, Python, and shader files.
  15. Usage:
  16. scripts/remove-comments.sh [options] [PATH ...]
  17. Options:
  18. -x, --ext Comma-separated extensions to scan (default: c,cc,cpp,cxx,h,hh,hpp,hxx,ipp,inl,tpp,qml,vert,frag,glsl,py)
  19. -n, --dry-run Show files that would be modified; don't write changes
  20. --backup Create FILE.bak before writing (default: OFF)
  21. -q, --quiet Less output
  22. -h, --help Show this help
  23. Examples:
  24. scripts/remove-comments.sh
  25. scripts/remove-comments.sh --backup src/ include/
  26. scripts/remove-comments.sh -x c,cpp,hpp
  27. scripts/remove-comments.sh assets/shaders/
  28. USAGE
  29. }
  30. log() { (( QUIET == 0 )) && printf '%s\n' "$*"; }
  31. die() { printf 'error: %s\n' "$*" >&2; exit 1; }
  32. # --- arg parsing ---
  33. args=()
  34. while [[ $# -gt 0 ]]; do
  35. case "$1" in
  36. -x|--ext) EXTS="${2:?missing extensions}"; shift 2 ;;
  37. -n|--dry-run) DRY_RUN=1; shift ;;
  38. --backup) BACKUP=1; shift ;;
  39. -q|--quiet) QUIET=1; shift ;;
  40. -h|--help) usage; exit 0 ;;
  41. --) shift; break ;;
  42. -*) die "Unknown option: $1" ;;
  43. *) args+=("$1"); shift ;;
  44. esac
  45. done
  46. ((${#args[@]})) && ROOTS=("${args[@]}")
  47. # Build extension list (portable; no mapfile)
  48. IFS=',' read -r -a EXT_ARR <<< "$EXTS"
  49. ((${#EXT_ARR[@]})) || die "No extensions provided"
  50. # Build find predicates
  51. FIND_NAME=()
  52. for e in "${EXT_ARR[@]}"; do
  53. e="${e#.}"
  54. FIND_NAME+=(-o -iname "*.${e}")
  55. done
  56. FIND_NAME=("${FIND_NAME[@]:1}") # drop leading -o
  57. # Pick Python
  58. if command -v python3 >/dev/null 2>&1; then
  59. PYTHON_BIN=python3
  60. elif command -v python >/dev/null 2>&1; then
  61. PYTHON_BIN=python
  62. else
  63. die "Python is required but not found."
  64. fi
  65. # Python filter as a literal (processes BYTES; preserves UTF-8/Unicode)
  66. PY_FILTER=$(cat <<'PYCODE'
  67. import sys, re, os
  68. # Read raw bytes and operate purely on bytes so UTF-8 (and any other) is preserved.
  69. path = sys.argv[1]
  70. with open(path, 'rb') as f:
  71. data = f.read()
  72. # Detect file type based on extension
  73. is_python = path.lower().endswith('.py')
  74. RAW_PREFIX = re.compile(rb'(?:u8|u|U|L)?R"([^\s()\\]{0,16})\(')
  75. def isspace(b): # b is an int 0..255
  76. return b in b' \t\r\n\v\f'
  77. def strip_cpp_comments(b: bytes) -> bytes:
  78. out = bytearray()
  79. i = 0
  80. n = len(b)
  81. def prev_byte():
  82. return out[-1] if out else None
  83. while i < n:
  84. # C++ raw string?
  85. m = RAW_PREFIX.match(b, i)
  86. if m:
  87. delim = m.group(1)
  88. start = m.end()
  89. end_token = b')' + delim + b'"'
  90. j = b.find(end_token, start)
  91. if j != -1:
  92. out += b[i:j+len(end_token)]
  93. i = j + len(end_token)
  94. continue
  95. c = b[i]
  96. # Regular string / char literals
  97. if c == 0x22 or c == 0x27: # " or '
  98. quote = c
  99. out.append(c); i += 1
  100. while i < n:
  101. ch = b[i]; out.append(ch); i += 1
  102. if ch == 0x5C and i < n: # backslash -> escape next byte verbatim
  103. out.append(b[i]); i += 1
  104. elif ch == quote:
  105. break
  106. continue
  107. # Comments
  108. if c == 0x2F and i + 1 < n: # '/'
  109. nx = b[i+1]
  110. # // line comment
  111. if nx == 0x2F:
  112. i += 2
  113. while i < n and b[i] != 0x0A:
  114. i += 1
  115. if i < n and b[i] == 0x0A:
  116. # Preserve CRLF if present
  117. if i > 0 and b[i-1] == 0x0D:
  118. out += b'\r\n'
  119. else:
  120. out += b'\n'
  121. i += 1
  122. continue
  123. # /* block comment */
  124. if nx == 0x2A:
  125. i += 2
  126. had_nl = False
  127. while i < n - 1:
  128. if b[i] == 0x0A:
  129. had_nl = True
  130. if b[i] == 0x2A and b[i+1] == 0x2F:
  131. i += 2
  132. break
  133. i += 1
  134. # Insert minimal whitespace so tokens don't glue
  135. nextc = b[i] if i < n else None
  136. p = prev_byte()
  137. if had_nl:
  138. if p not in (None, 0x0A, 0x0D):
  139. out.append(0x0A) # '\n'
  140. else:
  141. if p is not None and not isspace(p) and (nextc is not None) and not isspace(nextc):
  142. out.append(0x20) # ' '
  143. continue
  144. # Default: copy byte verbatim (preserves any UTF-8 / binary)
  145. out.append(c); i += 1
  146. return bytes(out)
  147. def strip_python_comments(b: bytes) -> bytes:
  148. out = bytearray()
  149. i = 0
  150. n = len(b)
  151. # Preserve shebang line if present at start of file
  152. if n > 2 and b[0] == 0x23 and b[1] == 0x21: # '#!'
  153. while i < n and b[i] != 0x0A:
  154. out.append(b[i])
  155. i += 1
  156. if i < n and b[i] == 0x0A:
  157. out.append(b[i])
  158. i += 1
  159. while i < n:
  160. c = b[i]
  161. # String literals (single, double, and triple-quoted)
  162. if c == 0x22 or c == 0x27: # " or '
  163. quote = c
  164. # Check for triple-quote
  165. if i + 2 < n and b[i+1] == quote and b[i+2] == quote:
  166. # Triple-quoted string
  167. out.append(c); out.append(c); out.append(c)
  168. i += 3
  169. while i < n:
  170. ch = b[i]
  171. out.append(ch)
  172. i += 1
  173. if ch == 0x5C and i < n: # backslash escape
  174. out.append(b[i])
  175. i += 1
  176. elif ch == quote and i + 1 < n and b[i] == quote and i + 2 < n and b[i+1] == quote:
  177. # Found closing triple-quote
  178. out.append(b[i])
  179. out.append(b[i+1])
  180. i += 2
  181. break
  182. else:
  183. # Single or double quoted string
  184. out.append(c)
  185. i += 1
  186. while i < n:
  187. ch = b[i]
  188. out.append(ch)
  189. i += 1
  190. if ch == 0x5C and i < n: # backslash escape
  191. out.append(b[i])
  192. i += 1
  193. elif ch == quote:
  194. break
  195. elif ch == 0x0A: # newline ends unclosed string
  196. break
  197. continue
  198. # # comment
  199. if c == 0x23: # '#'
  200. i += 1
  201. while i < n and b[i] != 0x0A:
  202. i += 1
  203. if i < n and b[i] == 0x0A:
  204. # Preserve line ending
  205. if i > 0 and b[i-1] == 0x0D:
  206. out += b'\r\n'
  207. else:
  208. out += b'\n'
  209. i += 1
  210. continue
  211. # Default: copy byte verbatim
  212. out.append(c)
  213. i += 1
  214. return bytes(out)
  215. if is_python:
  216. sys.stdout.buffer.write(strip_python_comments(data))
  217. else:
  218. sys.stdout.buffer.write(strip_cpp_comments(data))
  219. PYCODE
  220. )
  221. changed=0
  222. processed=0
  223. process_file() {
  224. local f="$1"
  225. log "processing: $f"
  226. # Capture current file mode (GNU and BSD)
  227. local mode
  228. mode="$(stat -c '%a' "$f" 2>/dev/null || stat -f '%Lp' "$f" 2>/dev/null || echo '')"
  229. # mktemp: handle BSD/GNU differences
  230. local tmp
  231. tmp="$(mktemp 2>/dev/null || mktemp -t rmcomments)" || die "mktemp failed"
  232. # Run the Python filter; keep argv[1] = file path
  233. if ! printf '%s\n' "$PY_FILTER" | "$PYTHON_BIN" - "$f" >"$tmp"; then
  234. rm -f "$tmp"
  235. die "Python filter failed on $f"
  236. fi
  237. if ! cmp -s "$f" "$tmp"; then
  238. if (( DRY_RUN == 1 )); then
  239. echo "would modify: $f"
  240. rm -f "$tmp"
  241. ((processed+=1))
  242. return
  243. fi
  244. if (( BACKUP == 1 )); then
  245. cp -p -- "$f" "$f.bak" 2>/dev/null || cp -p "$f" "$f.bak" || true
  246. fi
  247. # Replace file
  248. mv -- "$tmp" "$f" 2>/dev/null || mv "$tmp" "$f"
  249. # Restore original mode if we captured it
  250. [[ -n "$mode" ]] && chmod "$mode" "$f" 2>/dev/null || true
  251. ((changed+=1))
  252. else
  253. rm -f "$tmp"
  254. fi
  255. ((processed+=1))
  256. }
  257. log "Scanning: ${ROOTS[*]}"
  258. log "Extensions: $EXTS"
  259. (( DRY_RUN )) && log "(dry run)"
  260. # Find files and process
  261. while IFS= read -r -d '' f; do
  262. process_file "$f"
  263. done < <(
  264. find "${ROOTS[@]}" -type f \( "${FIND_NAME[@]}" \) \
  265. -not -path '*/.git/*' -not -path '*/.svn/*' -not -path '*/build/*' -print0
  266. )
  267. if (( DRY_RUN == 1 )); then
  268. echo "dry run complete. processed: $processed file(s); would modify: $changed"
  269. else
  270. echo "done. processed: $processed file(s); modified: $changed"
  271. fi