// lossless_enc_mips32.c

// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut    ([email protected])
//            Jovan Zelincevic ([email protected])

#include "./dsp.h"
#include "./lossless.h"

#if defined(WEBP_USE_MIPS32)

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    __asm__ volatile(
      "clz      %[log_cnt], %[v]                       \n\t"
      "addiu    %[y],       $zero,        1            \n\t"
      "subu     %[log_cnt], %[c24],       %[log_cnt]   \n\t"
      "sllv     %[y],       %[y],         %[log_cnt]   \n\t"
      "srlv     %[temp],    %[v],         %[log_cnt]   \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16

    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}
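
// Roughly equivalent C for the asm block above (a sketch, not part of the
// compiled code; __builtin_clz is the GCC builtin behind the MIPS32 'clz'
// instruction). The same normalization is reused in FastLog2Slow() below:
//   log_cnt = 24 - __builtin_clz(v);  // v >= 256 (see assert), so clz(v) <= 23
//   y       = 1u << log_cnt;          // y = 2^log_cnt
//   temp    = v >> log_cnt;           // top 8 bits of v: 128 <= temp < 256
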
static float FastLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

    __asm__ volatile(
      "clz      %[log_cnt], %[v]                       \n\t"
      "addiu    %[y],       $zero,        1            \n\t"
      "subu     %[log_cnt], %[c24],       %[log_cnt]   \n\t"
      "sllv     %[y],       %[y],         %[log_cnt]   \n\t"
      "srlv     %[temp],    %[v],         %[log_cnt]   \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor
      // only for large values of 'v'.
      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}
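
// Worked example for the approximation above (assuming kLog2Table[i] ~ log2(i)
// and APPROX_LOG_MAX == 4096, as in the portable lossless code):
// for v = 5000 (13 significant bits),
//   clz(5000) = 19, log_cnt = 24 - 19 = 5, y = 32, temp = 5000 >> 5 = 156
//   log_2 = log2(156) + 5 ~ 7.2854 + 5 = 12.2854
//   correction = (23 * (5000 & 31)) >> 4 = (23 * 8) >> 4 = 11
//   log_2 + 11 / 5000 ~ 12.2876, vs. the exact log2(5000) ~ 12.2877
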
// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return (double)cost;
static double ExtraCost(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  // The 64-bit cost is accumulated in the hi/lo register pair:
  // 'mult $zero, $zero' clears it, and each 'madd' adds a 32x32->64-bit
  // product to it.
  __asm__ volatile(
    "mult   $zero,    $zero                    \n\t"
    "xor    %[i],     %[i],       %[i]         \n\t"
    "beq    %[pop],   %[LoopEnd], 2f           \n\t"
  "1:                                          \n\t"
    "lw     %[temp0], 0(%[pop])                \n\t"
    "lw     %[temp1], 4(%[pop])                \n\t"
    "addiu  %[i],     %[i],       1            \n\t"
    "addiu  %[pop],   %[pop],     8            \n\t"
    "madd   %[i],     %[temp0]                 \n\t"
    "madd   %[i],     %[temp1]                 \n\t"
    "bne    %[pop],   %[LoopEnd], 1b           \n\t"
  "2:                                          \n\t"
    "mfhi   %[temp0]                           \n\t"
    "mflo   %[temp1]                           \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );
  // Reassemble the 64-bit accumulator from its halves; the cast keeps the
  // low half from sign-extending into the high word.
  return (double)((int64_t)temp0 << 32 | (uint32_t)temp1);
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pX = &X[4];
//   const uint32_t* pY = &Y[4];
//   const uint32_t* LoopEnd = &X[length];
//   while (pX != LoopEnd) {
//     const uint32_t xy0 = *pX + *pY;
//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
//     ++i;
//     cost += i * xy0;
//     cost += i * xy1;
//     pX += 2;
//     pY += 2;
//   }
//   return (double)cost;
static double ExtraCostCombined(const uint32_t* const X,
                                const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  // Same hi/lo accumulation scheme as in ExtraCost() above.
  __asm__ volatile(
    "mult   $zero,    $zero                    \n\t"
    "xor    %[i],     %[i],       %[i]         \n\t"
    "beq    %[pX],    %[LoopEnd], 2f           \n\t"
  "1:                                          \n\t"
    "lw     %[temp0], 0(%[pX])                 \n\t"
    "lw     %[temp1], 0(%[pY])                 \n\t"
    "lw     %[temp2], 4(%[pX])                 \n\t"
    "lw     %[temp3], 4(%[pY])                 \n\t"
    "addiu  %[i],     %[i],       1            \n\t"
    "addu   %[temp0], %[temp0],   %[temp1]     \n\t"
    "addu   %[temp2], %[temp2],   %[temp3]     \n\t"
    "addiu  %[pX],    %[pX],      8            \n\t"
    "addiu  %[pY],    %[pY],      8            \n\t"
    "madd   %[i],     %[temp0]                 \n\t"
    "madd   %[i],     %[temp2]                 \n\t"
    "bne    %[pX],    %[LoopEnd], 1b           \n\t"
  "2:                                          \n\t"
    "mfhi   %[temp0]                           \n\t"
    "mflo   %[temp1]                           \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );
  return (double)((int64_t)temp0 << 32 | (uint32_t)temp1);
}

#define HUFFMAN_COST_PASS                                     \
  __asm__ volatile(                                           \
    "sll    %[temp1],  %[temp0],    3            \n\t"        \
    "addiu  %[temp3],  %[streak],   -3           \n\t"        \
    "addu   %[temp2],  %[pstreaks], %[temp1]     \n\t"        \
    "blez   %[temp3],  1f                        \n\t"        \
    "srl    %[temp1],  %[temp1],    1            \n\t"        \
    "addu   %[temp3],  %[pcnts],    %[temp1]     \n\t"        \
    "lw     %[temp0],  4(%[temp2])               \n\t"        \
    "lw     %[temp1],  0(%[temp3])               \n\t"        \
    "addu   %[temp0],  %[temp0],    %[streak]    \n\t"        \
    "addiu  %[temp1],  %[temp1],    1            \n\t"        \
    "sw     %[temp0],  4(%[temp2])               \n\t"        \
    "sw     %[temp1],  0(%[temp3])               \n\t"        \
    "b      2f                                   \n\t"        \
  "1:                                            \n\t"        \
    "lw     %[temp0],  0(%[temp2])               \n\t"        \
    "addu   %[temp0],  %[temp0],    %[streak]    \n\t"        \
    "sw     %[temp0],  0(%[temp2])               \n\t"        \
  "2:                                            \n\t"        \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),               \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)                 \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),             \
      [streak]"r"(streak)                                     \
    : "memory"                                                \
  );
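
// Roughly equivalent C for HUFFMAN_COST_PASS (a sketch, not compiled), with
// temp0 = (*val_prev != 0) selecting the zero/nonzero bucket:
//   if (streak > 3) {
//     stats->streaks[temp0][1] += streak;  // long streak: pstreaks[2*temp0+1]
//     ++stats->counts[temp0];              // pcnts[temp0]
//   } else {
//     stats->streaks[temp0][0] += streak;  // short streak: pstreaks[2*temp0+0]
//   }
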
// Returns the various RLE counts
static WEBP_INLINE void GetEntropyUnrefinedHelper(
    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
  int* const pstreaks = &stats->streaks[0][0];
  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;
  const int streak = i - *i_prev;

  // Gather info for the bit entropy.
  if (*val_prev != 0) {
    bit_entropy->sum += (*val_prev) * streak;
    bit_entropy->nonzeros += streak;
    bit_entropy->nonzero_code = *i_prev;
    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
    if (bit_entropy->max_val < *val_prev) {
      bit_entropy->max_val = *val_prev;
    }
  }

  // Gather info for the Huffman cost.
  temp0 = (*val_prev != 0);
  HUFFMAN_COST_PASS

  *val_prev = val;
  *i_prev = i;
}

#define ASM_START                                             \
  __asm__ volatile(                                           \
    ".set   push                                 \n\t"        \
    ".set   at                                   \n\t"        \
    ".set   macro                                \n\t"        \
  "1:                                            \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
//     if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)                 \
    "ulw    %[temp0],   " #A "(%[" #P0 "])       \n\t"        \
    "ulw    %[temp1],   " #B "(%[" #P0 "])       \n\t"        \
    "ulw    %[temp2],   " #C "(%[" #P0 "])       \n\t"        \
    "ulw    %[temp3],   " #D "(%[" #P0 "])       \n\t"        \
    "ulw    %[temp4],   " #A "(%[" #P1 "])       \n\t"        \
    "ulw    %[temp5],   " #B "(%[" #P1 "])       \n\t"        \
    "ulw    %[temp6],   " #C "(%[" #P1 "])       \n\t"        \
    "ulw    %[temp7],   " #D "(%[" #P1 "])       \n\t"        \
    "addu   %[temp4],   %[temp4],   %[temp0]     \n\t"        \
    "addu   %[temp5],   %[temp5],   %[temp1]     \n\t"        \
    "addu   %[temp6],   %[temp6],   %[temp2]     \n\t"        \
    "addu   %[temp7],   %[temp7],   %[temp3]     \n\t"        \
    "addiu  %[" #P0 "], %[" #P0 "], 16           \n\t"        \
    ".if " #E " == 1                             \n\t"        \
    "addiu  %[" #P1 "], %[" #P1 "], 16           \n\t"        \
    ".endif                                      \n\t"        \
    "usw    %[temp4],   " #A "(%[" #P2 "])       \n\t"        \
    "usw    %[temp5],   " #B "(%[" #P2 "])       \n\t"        \
    "usw    %[temp6],   " #C "(%[" #P2 "])       \n\t"        \
    "usw    %[temp7],   " #D "(%[" #P2 "])       \n\t"        \
    "addiu  %[" #P2 "], %[" #P2 "], 16           \n\t"        \
    "bne    %[" #P0 "], %[LoopEnd], 1b           \n\t"        \
    ".set   pop                                  \n\t"

#define ASM_END_COMMON_0                                      \
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),               \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),               \
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),               \
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),               \
      [pa]"+r"(pa), [pout]"+r"(pout)

#define ASM_END_COMMON_1                                      \
    : [LoopEnd]"r"(LoopEnd)                                   \
    : "memory", "at"                                          \
  );

#define ASM_END_0                                             \
    ASM_END_COMMON_0                                          \
    , [pb]"+r"(pb)                                            \
    ASM_END_COMMON_1

#define ASM_END_1                                             \
    ASM_END_COMMON_0                                          \
    ASM_END_COMMON_1

#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE)  do {         \
  const uint32_t* pa = (const uint32_t*)(A);                  \
  const uint32_t* pb = (const uint32_t*)(B);                  \
  uint32_t* pout = (uint32_t*)(OUT);                          \
  const uint32_t* const LoopEnd = pa + (SIZE);                \
  assert((SIZE) % 4 == 0);                                    \
  ASM_START                                                   \
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)                    \
  ASM_END_0                                                   \
  if ((EXTRA_SIZE) > 0) {                                     \
    const int last = (EXTRA_SIZE);                            \
    int i;                                                    \
    for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i];       \
  }                                                           \
} while (0)

#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE)  do {         \
  const uint32_t* pa = (const uint32_t*)(A);                  \
  uint32_t* pout = (uint32_t*)(OUT);                          \
  const uint32_t* const LoopEnd = pa + (SIZE);                \
  assert((SIZE) % 4 == 0);                                    \
  ASM_START                                                   \
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)                  \
  ASM_END_1                                                   \
  if ((EXTRA_SIZE) > 0) {                                     \
    const int last = (EXTRA_SIZE);                            \
    int i;                                                    \
    for (i = 0; i < last; ++i) pout[i] += pa[i];              \
  }                                                           \
} while (0)
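
// Roughly equivalent C for ADD_VECTOR (a sketch, not compiled): the asm loop
// adds SIZE entries four at a time, and since it leaves 'pa'/'pout' advanced
// past those SIZE entries, the scalar loop then covers the EXTRA_SIZE tail:
//   int i;
//   for (i = 0; i < (SIZE) + (EXTRA_SIZE); ++i) {
//     ((uint32_t*)(OUT))[i] =
//         ((const uint32_t*)(A))[i] + ((const uint32_t*)(B))[i];
//   }
// ADD_VECTOR_EQ is the in-place variant: OUT[i] += A[i].
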
static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  // literal_ also carries the color-cache codes; that tail is handled by the
  // scalar EXTRA_SIZE loop of ADD_VECTOR/ADD_VECTOR_EQ.
  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
  assert(a->palette_code_bits_ == b->palette_code_bits_);

  if (b != out) {
    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
               NUM_DISTANCE_CODES, 0);
    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  } else {
    ADD_VECTOR_EQ(a->literal_, out->literal_,
                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  }
}

#undef ADD_VECTOR_EQ
#undef ADD_VECTOR
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START

//------------------------------------------------------------------------------
// Entry point

extern void VP8LEncDspInitMIPS32(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
  VP8LFastSLog2Slow = FastSLog2Slow;
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LExtraCost = ExtraCost;
  VP8LExtraCostCombined = ExtraCostCombined;
  VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper;
  VP8LHistogramAdd = HistogramAdd;
}
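
// Dispatch sketch (an assumption based on the usual libwebp pattern, not on
// this file): the generic initializer in lossless_enc.c is expected to
// install these pointers only when MIPS32 support is detected, roughly:
//   #if defined(WEBP_USE_MIPS32)
//     if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kMIPS32)) {
//       VP8LEncDspInitMIPS32();
//     }
//   #endif
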
#else  // !WEBP_USE_MIPS32

WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)

#endif  // WEBP_USE_MIPS32