inv_txfm.c 96 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697
  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <math.h>
  11. #include <stdlib.h>
  12. #include <string.h>
  13. #include "./vpx_dsp_rtcd.h"
  14. #include "vpx_dsp/inv_txfm.h"
// Full 4x4 inverse Walsh-Hadamard transform (used for lossless mode):
// transforms the 16 coefficients in |input| and accumulates the clipped
// result into the |dest| pixel block with row pitch |stride|.
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Pass 1: 1-D inverse WHT on each of the 4 rows into |output|.
  // Coefficients are loaded in the transform's butterfly order
  // (0,1,2,3 -> a,c,d,b) and descaled by UNIT_QUANT_SHIFT.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared half-sum makes the transform reversible
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Pass 2: same 1-D inverse WHT down each column (no descaling this time),
  // adding the result into the destination pixels with clipping.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
    ip++;
    dest++;
  }
}
  63. void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
  64. int i;
  65. tran_high_t a1, e1;
  66. tran_low_t tmp[4];
  67. const tran_low_t *ip = in;
  68. tran_low_t *op = tmp;
  69. a1 = ip[0] >> UNIT_QUANT_SHIFT;
  70. e1 = a1 >> 1;
  71. a1 -= e1;
  72. op[0] = WRAPLOW(a1);
  73. op[1] = op[2] = op[3] = WRAPLOW(e1);
  74. ip = tmp;
  75. for (i = 0; i < 4; i++) {
  76. e1 = ip[0] >> 1;
  77. a1 = ip[0] - e1;
  78. dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
  79. dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
  80. dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
  81. dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
  82. ip++;
  83. dest++;
  84. }
  85. }
// 4-point inverse ADST (asymmetric discrete sine transform), C reference.
// Transforms 4 coefficients from |input| into 4 values in |output| using
// fixed-point sinpi_* constants with dct_const_round_shift descaling.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // Early out: an all-zero input transforms to all zeros.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // 32-bit result is enough for the following multiplications.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = WRAPLOW(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}
// 4-point inverse DCT, C reference. Two butterfly stages over fixed-point
// cospi_* constants; each multiply is descaled with dct_const_round_shift.
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;

  // stage 1: even part (0,2) via the 16/64 rotation, odd part (1,3) via
  // the 24/8 rotation.
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
  136. void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  137. int i, j;
  138. tran_low_t out[4 * 4];
  139. tran_low_t *outptr = out;
  140. tran_low_t temp_in[4], temp_out[4];
  141. // Rows
  142. for (i = 0; i < 4; ++i) {
  143. idct4_c(input, outptr);
  144. input += 4;
  145. outptr += 4;
  146. }
  147. // Columns
  148. for (i = 0; i < 4; ++i) {
  149. for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
  150. idct4_c(temp_in, temp_out);
  151. for (j = 0; j < 4; ++j) {
  152. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  153. ROUND_POWER_OF_TWO(temp_out[j], 4));
  154. }
  155. }
  156. }
  157. void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  158. int i;
  159. tran_high_t a1;
  160. tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  161. out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  162. a1 = ROUND_POWER_OF_TWO(out, 4);
  163. for (i = 0; i < 4; i++) {
  164. dest[0] = clip_pixel_add(dest[0], a1);
  165. dest[1] = clip_pixel_add(dest[1], a1);
  166. dest[2] = clip_pixel_add(dest[2], a1);
  167. dest[3] = clip_pixel_add(dest[3], a1);
  168. dest += stride;
  169. }
  170. }
// 8-point inverse ADST, C reference. The inputs are read in the ADST's
// permuted order; three butterfly stages follow, with (int) truncation of
// the intermediate products matching the reference bit-exact behavior.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  // Input permutation required by the ADST decomposition.
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // Early out: an all-zero input transforms to all zeros.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: top half passes through; bottom half is rotated by 8/24.
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: final sqrt(1/2) rotations on the middle pairs.
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));
  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Output permutation with alternating sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
// 8-point inverse DCT, C reference. Four butterfly stages; the even half
// reuses the 4-point structure and the odd half is built from the 4/28 and
// 20/12 rotations.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1: even inputs (0,4,2,6) pass through; odd inputs (1,7) and (5,3)
  // go through their initial rotations.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: 4-point idct structure on the even half; butterflies on the odd.
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3: even-half butterfly; middle odd pair rotated by cospi_16_64.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final combine of even and odd halves.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
  287. void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  288. int i, j;
  289. tran_low_t out[8 * 8];
  290. tran_low_t *outptr = out;
  291. tran_low_t temp_in[8], temp_out[8];
  292. // First transform rows
  293. for (i = 0; i < 8; ++i) {
  294. idct8_c(input, outptr);
  295. input += 8;
  296. outptr += 8;
  297. }
  298. // Then transform columns
  299. for (i = 0; i < 8; ++i) {
  300. for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
  301. idct8_c(temp_in, temp_out);
  302. for (j = 0; j < 8; ++j) {
  303. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  304. ROUND_POWER_OF_TWO(temp_out[j], 5));
  305. }
  306. }
  307. }
  308. void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  309. int i, j;
  310. tran_low_t out[8 * 8] = { 0 };
  311. tran_low_t *outptr = out;
  312. tran_low_t temp_in[8], temp_out[8];
  313. // First transform rows
  314. // Only first 4 row has non-zero coefs
  315. for (i = 0; i < 4; ++i) {
  316. idct8_c(input, outptr);
  317. input += 8;
  318. outptr += 8;
  319. }
  320. // Then transform columns
  321. for (i = 0; i < 8; ++i) {
  322. for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
  323. idct8_c(temp_in, temp_out);
  324. for (j = 0; j < 8; ++j) {
  325. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  326. ROUND_POWER_OF_TWO(temp_out[j], 5));
  327. }
  328. }
  329. }
  330. void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  331. int i, j;
  332. tran_high_t a1;
  333. tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  334. out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  335. a1 = ROUND_POWER_OF_TWO(out, 5);
  336. for (j = 0; j < 8; ++j) {
  337. for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
  338. dest += stride;
  339. }
  340. }
// 16-point inverse ADST, C reference. The inputs are read in the ADST's
// permuted order, then four butterfly stages are applied; every product is
// descaled with dct_const_round_shift and wrapped to the transform's
// working precision with WRAPLOW.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Input permutation required by the ADST decomposition.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // Early out: an all-zero input transforms to all zeros.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: initial rotations pairing xi with x(i+1) using odd cospi angles.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: x0..x7 pass through; x8..x15 are rotated by 4/28 and 20/12.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3: pass-through quartets interleaved with 8/24 rotations.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: final sqrt(1/2) rotations on the remaining pairs.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);
  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Output permutation with sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
  498. void idct16_c(const tran_low_t *input, tran_low_t *output) {
  499. tran_low_t step1[16], step2[16];
  500. tran_high_t temp1, temp2;
  501. // stage 1
  502. step1[0] = input[0 / 2];
  503. step1[1] = input[16 / 2];
  504. step1[2] = input[8 / 2];
  505. step1[3] = input[24 / 2];
  506. step1[4] = input[4 / 2];
  507. step1[5] = input[20 / 2];
  508. step1[6] = input[12 / 2];
  509. step1[7] = input[28 / 2];
  510. step1[8] = input[2 / 2];
  511. step1[9] = input[18 / 2];
  512. step1[10] = input[10 / 2];
  513. step1[11] = input[26 / 2];
  514. step1[12] = input[6 / 2];
  515. step1[13] = input[22 / 2];
  516. step1[14] = input[14 / 2];
  517. step1[15] = input[30 / 2];
  518. // stage 2
  519. step2[0] = step1[0];
  520. step2[1] = step1[1];
  521. step2[2] = step1[2];
  522. step2[3] = step1[3];
  523. step2[4] = step1[4];
  524. step2[5] = step1[5];
  525. step2[6] = step1[6];
  526. step2[7] = step1[7];
  527. temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  528. temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  529. step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  530. step2[15] = WRAPLOW(dct_const_round_shift(temp2));
  531. temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  532. temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  533. step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  534. step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  535. temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  536. temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  537. step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  538. step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  539. temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  540. temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  541. step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  542. step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  543. // stage 3
  544. step1[0] = step2[0];
  545. step1[1] = step2[1];
  546. step1[2] = step2[2];
  547. step1[3] = step2[3];
  548. temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  549. temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  550. step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  551. step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  552. temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  553. temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  554. step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  555. step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  556. step1[8] = WRAPLOW(step2[8] + step2[9]);
  557. step1[9] = WRAPLOW(step2[8] - step2[9]);
  558. step1[10] = WRAPLOW(-step2[10] + step2[11]);
  559. step1[11] = WRAPLOW(step2[10] + step2[11]);
  560. step1[12] = WRAPLOW(step2[12] + step2[13]);
  561. step1[13] = WRAPLOW(step2[12] - step2[13]);
  562. step1[14] = WRAPLOW(-step2[14] + step2[15]);
  563. step1[15] = WRAPLOW(step2[14] + step2[15]);
  564. // stage 4
  565. temp1 = (step1[0] + step1[1]) * cospi_16_64;
  566. temp2 = (step1[0] - step1[1]) * cospi_16_64;
  567. step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  568. step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  569. temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  570. temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  571. step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  572. step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  573. step2[4] = WRAPLOW(step1[4] + step1[5]);
  574. step2[5] = WRAPLOW(step1[4] - step1[5]);
  575. step2[6] = WRAPLOW(-step1[6] + step1[7]);
  576. step2[7] = WRAPLOW(step1[6] + step1[7]);
  577. step2[8] = step1[8];
  578. step2[15] = step1[15];
  579. temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  580. temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  581. step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  582. step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  583. temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  584. temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  585. step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  586. step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  587. step2[11] = step1[11];
  588. step2[12] = step1[12];
  589. // stage 5
  590. step1[0] = WRAPLOW(step2[0] + step2[3]);
  591. step1[1] = WRAPLOW(step2[1] + step2[2]);
  592. step1[2] = WRAPLOW(step2[1] - step2[2]);
  593. step1[3] = WRAPLOW(step2[0] - step2[3]);
  594. step1[4] = step2[4];
  595. temp1 = (step2[6] - step2[5]) * cospi_16_64;
  596. temp2 = (step2[5] + step2[6]) * cospi_16_64;
  597. step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  598. step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  599. step1[7] = step2[7];
  600. step1[8] = WRAPLOW(step2[8] + step2[11]);
  601. step1[9] = WRAPLOW(step2[9] + step2[10]);
  602. step1[10] = WRAPLOW(step2[9] - step2[10]);
  603. step1[11] = WRAPLOW(step2[8] - step2[11]);
  604. step1[12] = WRAPLOW(-step2[12] + step2[15]);
  605. step1[13] = WRAPLOW(-step2[13] + step2[14]);
  606. step1[14] = WRAPLOW(step2[13] + step2[14]);
  607. step1[15] = WRAPLOW(step2[12] + step2[15]);
  608. // stage 6
  609. step2[0] = WRAPLOW(step1[0] + step1[7]);
  610. step2[1] = WRAPLOW(step1[1] + step1[6]);
  611. step2[2] = WRAPLOW(step1[2] + step1[5]);
  612. step2[3] = WRAPLOW(step1[3] + step1[4]);
  613. step2[4] = WRAPLOW(step1[3] - step1[4]);
  614. step2[5] = WRAPLOW(step1[2] - step1[5]);
  615. step2[6] = WRAPLOW(step1[1] - step1[6]);
  616. step2[7] = WRAPLOW(step1[0] - step1[7]);
  617. step2[8] = step1[8];
  618. step2[9] = step1[9];
  619. temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  620. temp2 = (step1[10] + step1[13]) * cospi_16_64;
  621. step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  622. step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  623. temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  624. temp2 = (step1[11] + step1[12]) * cospi_16_64;
  625. step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  626. step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  627. step2[14] = step1[14];
  628. step2[15] = step1[15];
  629. // stage 7
  630. output[0] = WRAPLOW(step2[0] + step2[15]);
  631. output[1] = WRAPLOW(step2[1] + step2[14]);
  632. output[2] = WRAPLOW(step2[2] + step2[13]);
  633. output[3] = WRAPLOW(step2[3] + step2[12]);
  634. output[4] = WRAPLOW(step2[4] + step2[11]);
  635. output[5] = WRAPLOW(step2[5] + step2[10]);
  636. output[6] = WRAPLOW(step2[6] + step2[9]);
  637. output[7] = WRAPLOW(step2[7] + step2[8]);
  638. output[8] = WRAPLOW(step2[7] - step2[8]);
  639. output[9] = WRAPLOW(step2[6] - step2[9]);
  640. output[10] = WRAPLOW(step2[5] - step2[10]);
  641. output[11] = WRAPLOW(step2[4] - step2[11]);
  642. output[12] = WRAPLOW(step2[3] - step2[12]);
  643. output[13] = WRAPLOW(step2[2] - step2[13]);
  644. output[14] = WRAPLOW(step2[1] - step2[14]);
  645. output[15] = WRAPLOW(step2[0] - step2[15]);
  646. }
  647. void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
  648. int stride) {
  649. int i, j;
  650. tran_low_t out[16 * 16];
  651. tran_low_t *outptr = out;
  652. tran_low_t temp_in[16], temp_out[16];
  653. // First transform rows
  654. for (i = 0; i < 16; ++i) {
  655. idct16_c(input, outptr);
  656. input += 16;
  657. outptr += 16;
  658. }
  659. // Then transform columns
  660. for (i = 0; i < 16; ++i) {
  661. for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
  662. idct16_c(temp_in, temp_out);
  663. for (j = 0; j < 16; ++j) {
  664. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  665. ROUND_POWER_OF_TWO(temp_out[j], 6));
  666. }
  667. }
  668. }
  669. void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
  670. int stride) {
  671. int i, j;
  672. tran_low_t out[16 * 16] = { 0 };
  673. tran_low_t *outptr = out;
  674. tran_low_t temp_in[16], temp_out[16];
  675. // First transform rows. Since all non-zero dct coefficients are in
  676. // upper-left 8x8 area, we only need to calculate first 8 rows here.
  677. for (i = 0; i < 8; ++i) {
  678. idct16_c(input, outptr);
  679. input += 16;
  680. outptr += 16;
  681. }
  682. // Then transform columns
  683. for (i = 0; i < 16; ++i) {
  684. for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
  685. idct16_c(temp_in, temp_out);
  686. for (j = 0; j < 16; ++j) {
  687. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  688. ROUND_POWER_OF_TWO(temp_out[j], 6));
  689. }
  690. }
  691. }
  692. void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
  693. int stride) {
  694. int i, j;
  695. tran_low_t out[16 * 16] = { 0 };
  696. tran_low_t *outptr = out;
  697. tran_low_t temp_in[16], temp_out[16];
  698. // First transform rows. Since all non-zero dct coefficients are in
  699. // upper-left 4x4 area, we only need to calculate first 4 rows here.
  700. for (i = 0; i < 4; ++i) {
  701. idct16_c(input, outptr);
  702. input += 16;
  703. outptr += 16;
  704. }
  705. // Then transform columns
  706. for (i = 0; i < 16; ++i) {
  707. for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
  708. idct16_c(temp_in, temp_out);
  709. for (j = 0; j < 16; ++j) {
  710. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  711. ROUND_POWER_OF_TWO(temp_out[j], 6));
  712. }
  713. }
  714. }
  715. void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  716. int i, j;
  717. tran_high_t a1;
  718. tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  719. out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  720. a1 = ROUND_POWER_OF_TWO(out, 6);
  721. for (j = 0; j < 16; ++j) {
  722. for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
  723. dest += stride;
  724. }
  725. }
// 1-D 32-point inverse DCT over a single row or column of coefficients.
// Implemented as a butterfly network: stage 1 bit-reverse-orders the even
// half and rotates the odd coefficients; the following stages alternate
// cospi_* rotations (scaled by 2^14, folded back by dct_const_round_shift)
// with add/subtract butterflies until the final stage emits 32 samples.
// WRAPLOW() wraps intermediates to the tran_low_t storage range.
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1: reorder the 16 even-frequency inputs, rotate the 16 odd ones.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: rotate 8..15, butterfly the 16..31 half.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3: rotate 4..7, butterfly 8..15, mixed rotations on 17/30, 18/29,
  // 21/26 and 22/25.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);
  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];
  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);
  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);
  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage: mirror-image butterfly producing the 32 output samples.
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
  1060. void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
  1061. int stride) {
  1062. int i, j;
  1063. tran_low_t out[32 * 32];
  1064. tran_low_t *outptr = out;
  1065. tran_low_t temp_in[32], temp_out[32];
  1066. // Rows
  1067. for (i = 0; i < 32; ++i) {
  1068. int16_t zero_coeff = 0;
  1069. for (j = 0; j < 32; ++j) zero_coeff |= input[j];
  1070. if (zero_coeff)
  1071. idct32_c(input, outptr);
  1072. else
  1073. memset(outptr, 0, sizeof(tran_low_t) * 32);
  1074. input += 32;
  1075. outptr += 32;
  1076. }
  1077. // Columns
  1078. for (i = 0; i < 32; ++i) {
  1079. for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
  1080. idct32_c(temp_in, temp_out);
  1081. for (j = 0; j < 32; ++j) {
  1082. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  1083. ROUND_POWER_OF_TWO(temp_out[j], 6));
  1084. }
  1085. }
  1086. }
  1087. void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
  1088. int stride) {
  1089. int i, j;
  1090. tran_low_t out[32 * 32] = { 0 };
  1091. tran_low_t *outptr = out;
  1092. tran_low_t temp_in[32], temp_out[32];
  1093. // Rows
  1094. // Only upper-left 16x16 has non-zero coeff
  1095. for (i = 0; i < 16; ++i) {
  1096. idct32_c(input, outptr);
  1097. input += 32;
  1098. outptr += 32;
  1099. }
  1100. // Columns
  1101. for (i = 0; i < 32; ++i) {
  1102. for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
  1103. idct32_c(temp_in, temp_out);
  1104. for (j = 0; j < 32; ++j) {
  1105. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  1106. ROUND_POWER_OF_TWO(temp_out[j], 6));
  1107. }
  1108. }
  1109. }
  1110. void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
  1111. int stride) {
  1112. int i, j;
  1113. tran_low_t out[32 * 32] = { 0 };
  1114. tran_low_t *outptr = out;
  1115. tran_low_t temp_in[32], temp_out[32];
  1116. // Rows
  1117. // Only upper-left 8x8 has non-zero coeff
  1118. for (i = 0; i < 8; ++i) {
  1119. idct32_c(input, outptr);
  1120. input += 32;
  1121. outptr += 32;
  1122. }
  1123. // Columns
  1124. for (i = 0; i < 32; ++i) {
  1125. for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
  1126. idct32_c(temp_in, temp_out);
  1127. for (j = 0; j < 32; ++j) {
  1128. dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
  1129. ROUND_POWER_OF_TWO(temp_out[j], 6));
  1130. }
  1131. }
  1132. }
  1133. void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  1134. int i, j;
  1135. tran_high_t a1;
  1136. tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  1137. out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  1138. a1 = ROUND_POWER_OF_TWO(out, 6);
  1139. for (j = 0; j < 32; ++j) {
  1140. for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
  1141. dest += stride;
  1142. }
  1143. }
  1144. #if CONFIG_VP9_HIGHBITDEPTH
  1145. // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
  1146. // transform amplify bits + 1 bit for contingency in rounding and quantizing
  1147. #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
  1148. static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
  1149. int size) {
  1150. int i;
  1151. for (i = 0; i < size; ++i)
  1152. if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
  1153. return 0;
  1154. }
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Pass 1: inverse WHT of each row into the intermediate buffer.  Inputs
  // are pre-scaled by UNIT_QUANT_SHIFT, so undo that here.  The add /
  // subtract / half-difference sequence is the exact inverse of the forward
  // lifting steps and must stay in this order to remain reversible.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared intermediate of the lifting scheme
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = HIGHBD_WRAPLOW(a1, bd);
    op[1] = HIGHBD_WRAPLOW(b1, bd);
    op[2] = HIGHBD_WRAPLOW(c1, bd);
    op[3] = HIGHBD_WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  // Pass 2: same inverse WHT down each column (stride-4 reads), clamping the
  // result into the destination at bit depth |bd|.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
    ip++;
    dest++;
  }
}
  1208. void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
  1209. int stride, int bd) {
  1210. int i;
  1211. tran_high_t a1, e1;
  1212. tran_low_t tmp[4];
  1213. const tran_low_t *ip = in;
  1214. tran_low_t *op = tmp;
  1215. (void)bd;
  1216. a1 = ip[0] >> UNIT_QUANT_SHIFT;
  1217. e1 = a1 >> 1;
  1218. a1 -= e1;
  1219. op[0] = HIGHBD_WRAPLOW(a1, bd);
  1220. op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
  1221. ip = tmp;
  1222. for (i = 0; i < 4; i++) {
  1223. e1 = ip[0] >> 1;
  1224. a1 = ip[0] - e1;
  1225. dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
  1226. dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
  1227. dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
  1228. dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
  1229. ip++;
  1230. dest++;
  1231. }
  1232. }
// 1-D 4-point inverse ADST (high bit depth) built from sinpi_*_9 rotations.
// Invalid (out-of-range) or all-zero input short-circuits to a zero output.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  (void)bd;

  // Guard against coefficients outside the valid transform range; emit
  // zeros rather than overflowing the fixed-point arithmetic below.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // Fast path: an all-zero input transforms to an all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = (tran_high_t)sinpi_1_9 * x0;
  s1 = (tran_high_t)sinpi_2_9 * x0;
  s2 = (tran_high_t)sinpi_3_9 * x1;
  s3 = (tran_high_t)sinpi_4_9 * x2;
  s4 = (tran_high_t)sinpi_1_9 * x2;
  s5 = (tran_high_t)sinpi_2_9 * x3;
  s6 = (tran_high_t)sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  // Reuse s3/s2 for the combined terms; order matters here (s2 is consumed
  // into s3 before being overwritten).
  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
// 4-point 1-D inverse DCT for the high-bitdepth path.
// input/output: 4 tran_low_t coefficients; bd: bit depth fed to
// HIGHBD_WRAPLOW for range clamping of intermediate results.
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void)bd;
  // Bail out (and zero the output) on coefficients outside the legal
  // high-bitdepth range; optionally assert in range-checking builds.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }
  // stage 1: even butterfly (inputs 0,2) and odd rotation (inputs 1,3).
  temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
  temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
  temp2 =
      input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  // stage 2: recombine the two halves into the final outputs.
  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}
  1300. void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
  1301. int stride, int bd) {
  1302. int i, j;
  1303. tran_low_t out[4 * 4];
  1304. tran_low_t *outptr = out;
  1305. tran_low_t temp_in[4], temp_out[4];
  1306. // Rows
  1307. for (i = 0; i < 4; ++i) {
  1308. vpx_highbd_idct4_c(input, outptr, bd);
  1309. input += 4;
  1310. outptr += 4;
  1311. }
  1312. // Columns
  1313. for (i = 0; i < 4; ++i) {
  1314. for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
  1315. vpx_highbd_idct4_c(temp_in, temp_out, bd);
  1316. for (j = 0; j < 4; ++j) {
  1317. dest[j * stride + i] = highbd_clip_pixel_add(
  1318. dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
  1319. }
  1320. }
  1321. }
  1322. void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
  1323. int stride, int bd) {
  1324. int i;
  1325. tran_high_t a1;
  1326. tran_low_t out = HIGHBD_WRAPLOW(
  1327. dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  1328. out =
  1329. HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  1330. a1 = ROUND_POWER_OF_TWO(out, 4);
  1331. for (i = 0; i < 4; i++) {
  1332. dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
  1333. dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
  1334. dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
  1335. dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
  1336. dest += stride;
  1337. }
  1338. }
// 8-point 1-D inverse ADST for the high-bitdepth path. The permuted input
// reads below and the sign flips on the outputs implement the ADST basis;
// statement order is bit-exact and must not be reordered.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  // Inputs are consumed in an ADST-specific permuted order.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void)bd;
  // Zero the output and bail on out-of-range coefficients.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }
  // All-zero input trivially yields an all-zero output; skip the math.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }
  // stage 1: odd-angle rotations followed by a butterfly with rounding.
  s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
  s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
  s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
  s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
  s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
  s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
  s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
  s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
  // stage 2: pass the first half through; rotate the second half.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
  s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
  s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
  s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  // stage 3: final cospi_16_64 rotations on the remaining pairs.
  s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
  s7 = (tran_high_t)cospi_16_64 * (x6 - x7);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  // Output permutation with alternating sign flips.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
// 8-point 1-D inverse DCT for the high-bitdepth path. The even half
// (inputs 0,2,4,6) is delegated to vpx_highbd_idct4_c; the odd half runs
// through its own rotation/butterfly stages. Statement order is bit-exact.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // Zero the output and bail on out-of-range coefficients.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }
  // stage 1: gather the even inputs for the 4-point sub-transform and
  // rotate the odd inputs (1,7) and (5,3).
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 =
      input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
  temp2 =
      input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
  temp2 =
      input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  // stage 2 & stage 3 - even half: in-place 4-point inverse DCT.
  vpx_highbd_idct4_c(step1, step1, bd);
  // stage 2 - odd half: butterflies.
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
  // stage 3 - odd half: cospi_16_64 rotation of the middle pair.
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];
  // stage 4: combine even and odd halves into the final outputs.
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
  1464. void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
  1465. int stride, int bd) {
  1466. int i, j;
  1467. tran_low_t out[8 * 8];
  1468. tran_low_t *outptr = out;
  1469. tran_low_t temp_in[8], temp_out[8];
  1470. // First transform rows
  1471. for (i = 0; i < 8; ++i) {
  1472. vpx_highbd_idct8_c(input, outptr, bd);
  1473. input += 8;
  1474. outptr += 8;
  1475. }
  1476. // Then transform columns
  1477. for (i = 0; i < 8; ++i) {
  1478. for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
  1479. vpx_highbd_idct8_c(temp_in, temp_out, bd);
  1480. for (j = 0; j < 8; ++j) {
  1481. dest[j * stride + i] = highbd_clip_pixel_add(
  1482. dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
  1483. }
  1484. }
  1485. }
  1486. void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
  1487. int stride, int bd) {
  1488. int i, j;
  1489. tran_low_t out[8 * 8] = { 0 };
  1490. tran_low_t *outptr = out;
  1491. tran_low_t temp_in[8], temp_out[8];
  1492. // First transform rows
  1493. // Only first 4 row has non-zero coefs
  1494. for (i = 0; i < 4; ++i) {
  1495. vpx_highbd_idct8_c(input, outptr, bd);
  1496. input += 8;
  1497. outptr += 8;
  1498. }
  1499. // Then transform columns
  1500. for (i = 0; i < 8; ++i) {
  1501. for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
  1502. vpx_highbd_idct8_c(temp_in, temp_out, bd);
  1503. for (j = 0; j < 8; ++j) {
  1504. dest[j * stride + i] = highbd_clip_pixel_add(
  1505. dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
  1506. }
  1507. }
  1508. }
  1509. void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
  1510. int stride, int bd) {
  1511. int i, j;
  1512. tran_high_t a1;
  1513. tran_low_t out = HIGHBD_WRAPLOW(
  1514. dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  1515. out =
  1516. HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  1517. a1 = ROUND_POWER_OF_TWO(out, 5);
  1518. for (j = 0; j < 8; ++j) {
  1519. for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
  1520. dest += stride;
  1521. }
  1522. }
// 16-point 1-D inverse ADST for the high-bitdepth path. Inputs are read in
// a permuted order and outputs carry ADST-specific sign flips; the four
// rotation/butterfly stages are bit-exact and must not be reordered.
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Inputs are consumed in an ADST-specific permuted order.
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void)bd;
  // Zero the output and bail on out-of-range coefficients.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }
  // All-zero input trivially yields an all-zero output; skip the math.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }
  // stage 1: odd-angle rotations, then butterfly with rounding.
  s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64;
  s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64;
  s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64;
  s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64;
  s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64;
  s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64;
  s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64;
  s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64;
  s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64;
  s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64;
  s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64;
  s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64;
  s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64;
  s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64;
  s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64;
  s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64;
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
  // stage 2: first half passes through; second half is rotated.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64;
  s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64;
  s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64;
  s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64;
  s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64;
  s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64;
  s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64;
  s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64;
  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
  // stage 3: cospi_8/24 rotations on the quarters that still need them.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64;
  s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64;
  s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64;
  s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64;
  s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64;
  s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64;
  s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64;
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
  // stage 4: final cospi_16_64 rotations (note the negated variants).
  s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3);
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
  s7 = (tran_high_t)cospi_16_64 * (-x6 + x7);
  s10 = (tran_high_t)cospi_16_64 * (x10 + x11);
  s11 = (tran_high_t)cospi_16_64 * (-x10 + x11);
  s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15);
  s15 = (tran_high_t)cospi_16_64 * (x14 - x15);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
  // Output permutation with ADST sign flips.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
// 16-point 1-D inverse DCT for the high-bitdepth path: a seven-stage
// butterfly network. The N/2 indices in stage 1 mirror the 32-point
// layout for readability. Statement order is bit-exact.
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void)bd;
  // Zero the output and bail on out-of-range coefficients.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }
  // stage 1: bit-reversal style input reordering.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];
  // stage 2: rotate the odd-indexed (8..15) half.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  temp1 =
      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
  temp2 =
      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
          step1[14] * (tran_high_t)cospi_18_64;
  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
          step1[14] * (tran_high_t)cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
          step1[13] * (tran_high_t)cospi_10_64;
  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
          step1[13] * (tran_high_t)cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
          step1[12] * (tran_high_t)cospi_26_64;
  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
          step1[12] * (tran_high_t)cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  // stage 3: rotate 4..7; butterflies on 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  temp1 =
      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
  temp2 =
      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
  temp2 =
      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
  // stage 4: rotate 0..3 and 9/14, 10/13; butterflies on 4..7.
  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
  temp2 =
      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
          step1[14] * (tran_high_t)cospi_24_64;
  temp2 =
      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
          step1[13] * (tran_high_t)cospi_8_64;
  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
          step1[13] * (tran_high_t)cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];
  // stage 5: butterflies on 0..3 and 8..15; cospi_16_64 rotation of 5/6.
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
  // stage 6: butterflies on 0..7; cospi_16_64 rotations of 10/13, 11/12.
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];
  // stage 7: final mirrored butterflies produce the outputs.
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
  1863. void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
  1864. int stride, int bd) {
  1865. int i, j;
  1866. tran_low_t out[16 * 16];
  1867. tran_low_t *outptr = out;
  1868. tran_low_t temp_in[16], temp_out[16];
  1869. // First transform rows
  1870. for (i = 0; i < 16; ++i) {
  1871. vpx_highbd_idct16_c(input, outptr, bd);
  1872. input += 16;
  1873. outptr += 16;
  1874. }
  1875. // Then transform columns
  1876. for (i = 0; i < 16; ++i) {
  1877. for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
  1878. vpx_highbd_idct16_c(temp_in, temp_out, bd);
  1879. for (j = 0; j < 16; ++j) {
  1880. dest[j * stride + i] = highbd_clip_pixel_add(
  1881. dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
  1882. }
  1883. }
  1884. }
  1885. void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
  1886. int stride, int bd) {
  1887. int i, j;
  1888. tran_low_t out[16 * 16] = { 0 };
  1889. tran_low_t *outptr = out;
  1890. tran_low_t temp_in[16], temp_out[16];
  1891. // First transform rows. Since all non-zero dct coefficients are in
  1892. // upper-left 8x8 area, we only need to calculate first 8 rows here.
  1893. for (i = 0; i < 8; ++i) {
  1894. vpx_highbd_idct16_c(input, outptr, bd);
  1895. input += 16;
  1896. outptr += 16;
  1897. }
  1898. // Then transform columns
  1899. for (i = 0; i < 16; ++i) {
  1900. uint16_t *destT = dest;
  1901. for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
  1902. vpx_highbd_idct16_c(temp_in, temp_out, bd);
  1903. for (j = 0; j < 16; ++j) {
  1904. destT[i] = highbd_clip_pixel_add(destT[i],
  1905. ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
  1906. destT += stride;
  1907. }
  1908. }
  1909. }
  1910. void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
  1911. int stride, int bd) {
  1912. int i, j;
  1913. tran_low_t out[16 * 16] = { 0 };
  1914. tran_low_t *outptr = out;
  1915. tran_low_t temp_in[16], temp_out[16];
  1916. // First transform rows. Since all non-zero dct coefficients are in
  1917. // upper-left 4x4 area, we only need to calculate first 4 rows here.
  1918. for (i = 0; i < 4; ++i) {
  1919. vpx_highbd_idct16_c(input, outptr, bd);
  1920. input += 16;
  1921. outptr += 16;
  1922. }
  1923. // Then transform columns
  1924. for (i = 0; i < 16; ++i) {
  1925. for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
  1926. vpx_highbd_idct16_c(temp_in, temp_out, bd);
  1927. for (j = 0; j < 16; ++j) {
  1928. dest[j * stride + i] = highbd_clip_pixel_add(
  1929. dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
  1930. }
  1931. }
  1932. }
  1933. void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
  1934. int stride, int bd) {
  1935. int i, j;
  1936. tran_high_t a1;
  1937. tran_low_t out = HIGHBD_WRAPLOW(
  1938. dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  1939. out =
  1940. HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  1941. a1 = ROUND_POWER_OF_TWO(out, 6);
  1942. for (j = 0; j < 16; ++j) {
  1943. for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
  1944. dest += stride;
  1945. }
  1946. }
// One-dimensional 32-point inverse DCT for high bit depth, operating on a
// single row or column of 32 coefficients.
//
// The transform runs as seven butterfly stages over two ping-pong arrays
// (step1/step2). Each rotation is evaluated at tran_high_t precision,
// rounded by dct_const_round_shift(), and wrapped back to the intermediate
// range by HIGHBD_WRAPLOW(..., bd). Stage 1 also performs the input
// reordering: even-indexed coefficients are permuted into step1[0..15] and
// odd-indexed pairs are rotated into step1[16..31].
//
// input:  32 coefficients (frequency domain).
// output: 32 samples (spatial domain, still unscaled — callers apply the
//         final ROUND_POWER_OF_TWO when adding to the destination).
// bd:     bit depth; used only inside HIGHBD_WRAPLOW (hence the (void)bd).
static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
                            int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void)bd;

  // Out-of-range coefficients (e.g. from corrupt input) would overflow the
  // fixed-point arithmetic below; zero the output and bail out instead.
  if (detect_invalid_highbd_input(input, 32)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 32);
    return;
  }

  // stage 1
  // Even inputs pass through (reordered); odd input pairs (k, 32-k) are
  // rotated by cospi_k into step1[16..31].
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 =
      input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64;
  temp2 =
      input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64;
  step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = input[17] * (tran_high_t)cospi_15_64 -
          input[15] * (tran_high_t)cospi_17_64;
  temp2 = input[17] * (tran_high_t)cospi_17_64 +
          input[15] * (tran_high_t)cospi_15_64;
  step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 =
      input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64;
  temp2 =
      input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64;
  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 =
      input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64;
  temp2 =
      input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64;
  step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 =
      input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64;
  temp2 =
      input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64;
  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = input[21] * (tran_high_t)cospi_11_64 -
          input[11] * (tran_high_t)cospi_21_64;
  temp2 = input[21] * (tran_high_t)cospi_21_64 +
          input[11] * (tran_high_t)cospi_11_64;
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = input[13] * (tran_high_t)cospi_19_64 -
          input[19] * (tran_high_t)cospi_13_64;
  temp2 = input[13] * (tran_high_t)cospi_13_64 +
          input[19] * (tran_high_t)cospi_19_64;
  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 =
      input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64;
  temp2 =
      input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64;
  step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2
  // step1[8..15] are rotated; step1[16..31] form add/sub butterflies.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 =
      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
  temp2 =
      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
          step1[14] * (tran_high_t)cospi_18_64;
  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
          step1[14] * (tran_high_t)cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
          step1[13] * (tran_high_t)cospi_10_64;
  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
          step1[13] * (tran_high_t)cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
          step1[12] * (tran_high_t)cospi_26_64;
  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
          step1[12] * (tran_high_t)cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 =
      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
  temp2 =
      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
  temp2 =
      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * (tran_high_t)cospi_4_64 +
          step2[30] * (tran_high_t)cospi_28_64;
  temp2 = step2[17] * (tran_high_t)cospi_28_64 +
          step2[30] * (tran_high_t)cospi_4_64;
  step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step2[18] * (tran_high_t)cospi_28_64 -
          step2[29] * (tran_high_t)cospi_4_64;
  temp2 = -step2[18] * (tran_high_t)cospi_4_64 +
          step2[29] * (tran_high_t)cospi_28_64;
  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * (tran_high_t)cospi_20_64 +
          step2[26] * (tran_high_t)cospi_12_64;
  temp2 = step2[21] * (tran_high_t)cospi_12_64 +
          step2[26] * (tran_high_t)cospi_20_64;
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step2[22] * (tran_high_t)cospi_12_64 -
          step2[25] * (tran_high_t)cospi_20_64;
  temp2 = -step2[22] * (tran_high_t)cospi_20_64 +
          step2[25] * (tran_high_t)cospi_12_64;
  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
  temp2 =
      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
          step1[14] * (tran_high_t)cospi_24_64;
  temp2 =
      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
          step1[13] * (tran_high_t)cospi_8_64;
  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
          step1[13] * (tran_high_t)cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);

  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);

  // stage 5
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
          step2[29] * (tran_high_t)cospi_24_64;
  temp2 = step2[18] * (tran_high_t)cospi_24_64 +
          step2[29] * (tran_high_t)cospi_8_64;
  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
          step2[28] * (tran_high_t)cospi_24_64;
  temp2 = step2[19] * (tran_high_t)cospi_24_64 +
          step2[28] * (tran_high_t)cospi_8_64;
  step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
          step2[27] * (tran_high_t)cospi_8_64;
  temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
          step2[27] * (tran_high_t)cospi_24_64;
  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
          step2[26] * (tran_high_t)cospi_8_64;
  temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
          step2[26] * (tran_high_t)cospi_24_64;
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);

  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);

  // stage 7
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
  step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // output[k] = step1[k] + step1[31-k]; output[31-k] = step1[k] - step1[31-k].
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
}
  2340. void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
  2341. int stride, int bd) {
  2342. int i, j;
  2343. tran_low_t out[32 * 32];
  2344. tran_low_t *outptr = out;
  2345. tran_low_t temp_in[32], temp_out[32];
  2346. // Rows
  2347. for (i = 0; i < 32; ++i) {
  2348. tran_low_t zero_coeff = 0;
  2349. for (j = 0; j < 32; ++j) zero_coeff |= input[j];
  2350. if (zero_coeff)
  2351. highbd_idct32_c(input, outptr, bd);
  2352. else
  2353. memset(outptr, 0, sizeof(tran_low_t) * 32);
  2354. input += 32;
  2355. outptr += 32;
  2356. }
  2357. // Columns
  2358. for (i = 0; i < 32; ++i) {
  2359. for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
  2360. highbd_idct32_c(temp_in, temp_out, bd);
  2361. for (j = 0; j < 32; ++j) {
  2362. dest[j * stride + i] = highbd_clip_pixel_add(
  2363. dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
  2364. }
  2365. }
  2366. }
  2367. void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
  2368. int stride, int bd) {
  2369. int i, j;
  2370. tran_low_t out[32 * 32] = { 0 };
  2371. tran_low_t *outptr = out;
  2372. tran_low_t temp_in[32], temp_out[32];
  2373. // Rows
  2374. // Only upper-left 16x16 has non-zero coeff
  2375. for (i = 0; i < 16; ++i) {
  2376. highbd_idct32_c(input, outptr, bd);
  2377. input += 32;
  2378. outptr += 32;
  2379. }
  2380. // Columns
  2381. for (i = 0; i < 32; ++i) {
  2382. uint16_t *destT = dest;
  2383. for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
  2384. highbd_idct32_c(temp_in, temp_out, bd);
  2385. for (j = 0; j < 32; ++j) {
  2386. destT[i] = highbd_clip_pixel_add(destT[i],
  2387. ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
  2388. destT += stride;
  2389. }
  2390. }
  2391. }
  2392. void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
  2393. int stride, int bd) {
  2394. int i, j;
  2395. tran_low_t out[32 * 32] = { 0 };
  2396. tran_low_t *outptr = out;
  2397. tran_low_t temp_in[32], temp_out[32];
  2398. // Rows
  2399. // Only upper-left 8x8 has non-zero coeff
  2400. for (i = 0; i < 8; ++i) {
  2401. highbd_idct32_c(input, outptr, bd);
  2402. input += 32;
  2403. outptr += 32;
  2404. }
  2405. // Columns
  2406. for (i = 0; i < 32; ++i) {
  2407. for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
  2408. highbd_idct32_c(temp_in, temp_out, bd);
  2409. for (j = 0; j < 32; ++j) {
  2410. dest[j * stride + i] = highbd_clip_pixel_add(
  2411. dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
  2412. }
  2413. }
  2414. }
  2415. void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
  2416. int stride, int bd) {
  2417. int i, j;
  2418. int a1;
  2419. tran_low_t out = HIGHBD_WRAPLOW(
  2420. dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  2421. out =
  2422. HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  2423. a1 = ROUND_POWER_OF_TWO(out, 6);
  2424. for (j = 0; j < 32; ++j) {
  2425. for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
  2426. dest += stride;
  2427. }
  2428. }
  2429. #endif // CONFIG_VP9_HIGHBITDEPTH