shoco.odin 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. /*
  2. Copyright 2022 Jeroen van Rijn <[email protected]>.
  3. Made available under Odin's BSD-3 license.
  4. List of contributors:
  5. Jeroen van Rijn: Initial implementation.
  6. An implementation of [shoco](https://github.com/Ed-von-Schleck/shoco) by Christian Schramm.
  7. */
  8. // package shoco is an implementation of the shoco short string compressor
  9. package shoco
  10. import "core:intrinsics"
  11. import "core:compress"
  12. Shoco_Pack :: struct {
  13. word: u32,
  14. bytes_packed: i8,
  15. bytes_unpacked: i8,
  16. offsets: [8]u16,
  17. masks: [8]i16,
  18. header_mask: u8,
  19. header: u8,
  20. }
  21. Shoco_Model :: struct {
  22. min_char: u8,
  23. max_char: u8,
  24. characters_by_id: []u8,
  25. ids_by_character: [256]i16,
  26. successors_by_bigram: []i8,
  27. successors_reversed: []u8,
  28. character_count: u8,
  29. successor_count: u8,
  30. max_successor_n: i8,
  31. packs: []Shoco_Pack,
  32. }
  33. compress_bound :: proc(uncompressed_size: int) -> (worst_case_compressed_size: int) {
  34. // Worst case compression happens when input is non-ASCII (128-255)
  35. // Encoded as 0x00 + the byte in question.
  36. return uncompressed_size * 2
  37. }
  38. decompress_bound :: proc(compressed_size: int, model := DEFAULT_MODEL) -> (maximum_decompressed_size: int) {
  39. // Best case compression is 2:1
  40. most: f64
  41. for pack in model.packs {
  42. val := f64(compressed_size) / f64(pack.bytes_packed) * f64(pack.bytes_unpacked)
  43. most = max(most, val)
  44. }
  45. return int(most)
  46. }
  47. find_best_encoding :: proc(indices: []i16, n_consecutive: i8, model := DEFAULT_MODEL) -> (res: int) {
  48. for p := len(model.packs); p > 0; p -= 1 {
  49. pack := model.packs[p - 1]
  50. if n_consecutive >= pack.bytes_unpacked {
  51. have_index := true
  52. for i := 0; i < int(pack.bytes_unpacked); i += 1 {
  53. if indices[i] > pack.masks[i] {
  54. have_index = false
  55. break
  56. }
  57. }
  58. if have_index {
  59. return p - 1
  60. }
  61. }
  62. }
  63. return -1
  64. }
  65. validate_model :: proc(model: Shoco_Model) -> (int, compress.Error) {
  66. if len(model.characters_by_id) != int(model.character_count) {
  67. return 0, .Unknown_Compression_Method
  68. }
  69. if len(model.successors_by_bigram) != int(model.character_count) * int(model.character_count) {
  70. return 0, .Unknown_Compression_Method
  71. }
  72. if len(model.successors_reversed) != int(model.successor_count) * int(model.max_char - model.min_char) {
  73. return 0, .Unknown_Compression_Method
  74. }
  75. // Model seems legit.
  76. return 0, nil
  77. }
  78. // Decompresses into provided buffer.
  79. decompress_slice_to_output_buffer :: proc(input: []u8, output: []u8, model := DEFAULT_MODEL) -> (size: int, err: compress.Error) {
  80. inp, inp_end := 0, len(input)
  81. out, out_end := 0, len(output)
  82. validate_model(model) or_return
  83. for inp < inp_end {
  84. val := transmute(i8)input[inp]
  85. mark := int(-1)
  86. for val < 0 {
  87. val <<= 1
  88. mark += 1
  89. }
  90. if mark > len(model.packs) {
  91. return out, .Unknown_Compression_Method
  92. }
  93. if mark < 0 {
  94. if out >= out_end {
  95. return out, .Output_Too_Short
  96. }
  97. // Ignore the sentinel value for non-ASCII chars
  98. if input[inp] == 0x00 {
  99. inp += 1
  100. if inp >= inp_end {
  101. return out, .Stream_Too_Short
  102. }
  103. }
  104. output[out] = input[inp]
  105. inp, out = inp + 1, out + 1
  106. } else {
  107. pack := model.packs[mark]
  108. if out + int(pack.bytes_unpacked) > out_end {
  109. return out, .Output_Too_Short
  110. } else if inp + int(pack.bytes_packed) > inp_end {
  111. return out, .Stream_Too_Short
  112. }
  113. code := intrinsics.unaligned_load((^u32)(&input[inp]))
  114. when ODIN_ENDIAN == .Little {
  115. code = intrinsics.byte_swap(code)
  116. }
  117. // Unpack the leading char
  118. offset := pack.offsets[0]
  119. mask := pack.masks[0]
  120. last_chr := model.characters_by_id[(code >> offset) & u32(mask)]
  121. output[out] = last_chr
  122. // Unpack the successor chars
  123. for i := 1; i < int(pack.bytes_unpacked); i += 1 {
  124. offset = pack.offsets[i]
  125. mask = pack.masks[i]
  126. index_major := u32(last_chr - model.min_char) * u32(model.successor_count)
  127. index_minor := (code >> offset) & u32(mask)
  128. last_chr = model.successors_reversed[index_major + index_minor]
  129. output[out + i] = last_chr
  130. }
  131. out += int(pack.bytes_unpacked)
  132. inp += int(pack.bytes_packed)
  133. }
  134. }
  135. return out, nil
  136. }
  137. decompress_slice_to_string :: proc(input: []u8, model := DEFAULT_MODEL, allocator := context.allocator) -> (res: string, err: compress.Error) {
  138. context.allocator = allocator
  139. if len(input) == 0 {
  140. return "", .Stream_Too_Short
  141. }
  142. max_output_size := decompress_bound(len(input), model)
  143. buf: [dynamic]u8
  144. if !resize(&buf, max_output_size) {
  145. return "", .Out_Of_Memory
  146. }
  147. length, result := decompress_slice_to_output_buffer(input, buf[:])
  148. resize(&buf, length)
  149. return string(buf[:]), result
  150. }
  151. decompress :: proc{decompress_slice_to_output_buffer, decompress_slice_to_string}
  152. compress_string_to_buffer :: proc(input: string, output: []u8, model := DEFAULT_MODEL, allocator := context.allocator) -> (size: int, err: compress.Error) {
  153. inp, inp_end := 0, len(input)
  154. out, out_end := 0, len(output)
  155. output := output
  156. validate_model(model) or_return
  157. indices := make([]i16, model.max_successor_n + 1)
  158. defer delete(indices)
  159. last_resort := false
  160. encode: for inp < inp_end {
  161. if last_resort {
  162. last_resort = false
  163. if input[inp] & 0x80 == 0x80 {
  164. // Non-ASCII case
  165. if out + 2 > out_end {
  166. return out, .Output_Too_Short
  167. }
  168. // Put in a sentinel byte
  169. output[out] = 0x00
  170. out += 1
  171. } else {
  172. // An ASCII byte
  173. if out + 1 > out_end {
  174. return out, .Output_Too_Short
  175. }
  176. }
  177. output[out] = input[inp]
  178. out, inp = out + 1, inp + 1
  179. } else {
  180. // Find the longest string of known successors
  181. indices[0] = model.ids_by_character[input[inp]]
  182. last_chr_index := indices[0]
  183. if last_chr_index < 0 {
  184. last_resort = true
  185. continue encode
  186. }
  187. rest := inp_end - inp
  188. n_consecutive: i8 = 1
  189. for ; n_consecutive <= model.max_successor_n; n_consecutive += 1 {
  190. if inp_end > 0 && int(n_consecutive) == rest {
  191. break
  192. }
  193. current_index := model.ids_by_character[input[inp + int(n_consecutive)]]
  194. if current_index < 0 { // '\0' is always -1
  195. break
  196. }
  197. successor_index := model.successors_by_bigram[last_chr_index * i16(model.character_count) + current_index]
  198. if successor_index < 0 {
  199. break
  200. }
  201. indices[n_consecutive] = i16(successor_index)
  202. last_chr_index = current_index
  203. }
  204. if n_consecutive < 2 {
  205. last_resort = true
  206. continue encode
  207. }
  208. pack_n := find_best_encoding(indices, n_consecutive)
  209. if pack_n >= 0 {
  210. if out + int(model.packs[pack_n].bytes_packed) > out_end {
  211. return out, .Output_Too_Short
  212. }
  213. pack := model.packs[pack_n]
  214. code := pack.word
  215. for i := 0; i < int(pack.bytes_unpacked); i += 1 {
  216. code |= u32(indices[i]) << pack.offsets[i]
  217. }
  218. // In the little-endian world, we need to swap what's in the register to match the memory representation.
  219. when ODIN_ENDIAN == .Little {
  220. code = intrinsics.byte_swap(code)
  221. }
  222. out_ptr := raw_data(output[out:])
  223. switch pack.bytes_packed {
  224. case 4:
  225. intrinsics.unaligned_store(transmute(^u32)out_ptr, code)
  226. case 2:
  227. intrinsics.unaligned_store(transmute(^u16)out_ptr, u16(code))
  228. case 1:
  229. intrinsics.unaligned_store(transmute(^u8)out_ptr, u8(code))
  230. case:
  231. return out, .Unknown_Compression_Method
  232. }
  233. out += int(pack.bytes_packed)
  234. inp += int(pack.bytes_unpacked)
  235. } else {
  236. last_resort = true
  237. continue encode
  238. }
  239. }
  240. }
  241. return out, nil
  242. }
  243. compress_string :: proc(input: string, model := DEFAULT_MODEL, allocator := context.allocator) -> (output: []u8, err: compress.Error) {
  244. context.allocator = allocator
  245. if len(input) == 0 {
  246. return {}, .Stream_Too_Short
  247. }
  248. max_output_size := compress_bound(len(input))
  249. buf: [dynamic]u8
  250. if !resize(&buf, max_output_size) {
  251. return {}, .Out_Of_Memory
  252. }
  253. length, result := compress_string_to_buffer(input, buf[:])
  254. resize(&buf, length)
  255. return buf[:length], result
  256. }
  257. compress :: proc{compress_string_to_buffer, compress_string}