threading.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864
  1. /*
  2. * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "vpx_config.h"
  11. #include "vp8_rtcd.h"
  12. #if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
  13. #include <unistd.h>
  14. #endif
  15. #include "onyxd_int.h"
  16. #include "vpx_mem/vpx_mem.h"
  17. #include "vp8/common/threading.h"
  18. #include "vp8/common/loopfilter.h"
  19. #include "vp8/common/extend.h"
  20. #include "vpx_ports/vpx_timer.h"
  21. #include "decoderthreading.h"
  22. #include "detokenize.h"
  23. #include "vp8/common/reconintra4x4.h"
  24. #include "vp8/common/reconinter.h"
  25. #include "vp8/common/reconintra.h"
  26. #include "vp8/common/setupintrarecon.h"
  27. #if CONFIG_ERROR_CONCEALMENT
  28. #include "error_concealment.h"
  29. #endif
  30. #define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
  31. #define CALLOC_ARRAY_ALIGNED(p, n, algn) \
  32. do { \
  33. CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
  34. memset((p), 0, (n) * sizeof(*(p))); \
  35. } while (0)
  36. static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
  37. MB_ROW_DEC *mbrd, int count) {
  38. VP8_COMMON *const pc = &pbi->common;
  39. int i;
  40. for (i = 0; i < count; ++i) {
  41. MACROBLOCKD *mbd = &mbrd[i].mbd;
  42. mbd->subpixel_predict = xd->subpixel_predict;
  43. mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
  44. mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
  45. mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
  46. mbd->frame_type = pc->frame_type;
  47. mbd->pre = xd->pre;
  48. mbd->dst = xd->dst;
  49. mbd->segmentation_enabled = xd->segmentation_enabled;
  50. mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
  51. memcpy(mbd->segment_feature_data, xd->segment_feature_data,
  52. sizeof(xd->segment_feature_data));
  53. /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
  54. memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
  55. /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
  56. memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
  57. /*unsigned char mode_ref_lf_delta_enabled;
  58. unsigned char mode_ref_lf_delta_update;*/
  59. mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
  60. mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update;
  61. mbd->current_bc = &pbi->mbc[0];
  62. memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
  63. memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
  64. memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
  65. memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
  66. mbd->fullpixel_mask = 0xffffffff;
  67. if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
  68. }
  69. for (i = 0; i < pc->mb_rows; ++i)
  70. vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1);
  71. }
/*
 * Reconstruct the macroblock currently addressed by xd on a decoding thread.
 *
 * Steps, in order:
 *   1. Reset the token context (skipped MB) or call vp8_decode_mb_tokens().
 *   2. Re-initialize the dequantizer when segmentation is enabled.
 *   3. With error concealment active, drop the residual of corrupt MBs.
 *   4. Build the intra or inter prediction into xd->dst.
 *   5. Dequantize and inverse-transform the residual onto the prediction.
 *
 * pbi    - decoder instance.
 * xd     - per-thread macroblock descriptor (mode info, buffers, coefficients).
 * mb_idx - only used when CONFIG_ERROR_CONCEALMENT is enabled, where it is
 *          compared against pbi->mvs_corrupt_from_mb.
 */
static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                                 unsigned int mb_idx) {
  MB_PREDICTION_MODE mode;
  int i;
#if CONFIG_ERROR_CONCEALMENT
  int corruption_detected = 0;
#else
  (void)mb_idx;
#endif

  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
    vp8_reset_mb_tokens_context(xd);
  } else if (!vp8dx_bool_error(xd->current_bc)) {
    /* Tokens are only decoded while the bool decoder reports no error. */
    int eobtotal;
    eobtotal = vp8_decode_mb_tokens(pbi, xd);

    /* Special case: Force the loopfilter to skip when eobtotal is zero */
    xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal == 0);
  }

  mode = xd->mode_info_context->mbmi.mode;

  if (xd->segmentation_enabled) vp8_mb_init_dequantizer(pbi, xd);

#if CONFIG_ERROR_CONCEALMENT

  if (pbi->ec_active) {
    int throw_residual;
    /* When we have independent partitions we can apply residual even
     * though other partitions within the frame are corrupt.
     */
    throw_residual =
        (!pbi->independent_partitions && pbi->frame_corrupt_residual);
    throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));

    if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) {
      /* MB with corrupt residuals or corrupt mode/motion vectors.
       * Better to use the predictor as reconstruction.
       */
      pbi->frame_corrupt_residual = 1;
      memset(xd->qcoeff, 0, sizeof(xd->qcoeff));

      corruption_detected = 1;

      /* force idct to be skipped for B_PRED and use the
       * prediction only for reconstruction
       * */
      /* 25 bytes: eobs[0..15] are used for luma below, xd->eobs + 16 for
       * chroma and eobs[24] for the second-order block. */
      memset(xd->eobs, 0, 25);
    }
  }
#endif

  /* do prediction */
  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
    /* Chroma is predicted the same way for every intra mode. */
    vp8_build_intra_predictors_mbuv_s(
        xd, xd->recon_above[1], xd->recon_above[2], xd->recon_left[1],
        xd->recon_left[2], xd->recon_left_stride[1], xd->dst.u_buffer,
        xd->dst.v_buffer, xd->dst.uv_stride);

    if (mode != B_PRED) {
      vp8_build_intra_predictors_mby_s(
          xd, xd->recon_above[0], xd->recon_left[0], xd->recon_left_stride[0],
          xd->dst.y_buffer, xd->dst.y_stride);
    } else {
      /* B_PRED: each of the 16 luma 4x4 blocks is predicted and has its
       * residual added before the next block is predicted. */
      short *DQC = xd->dequant_y1;
      int dst_stride = xd->dst.y_stride;

      /* clear out residual eob info */
      if (xd->mode_info_context->mbmi.mb_skip_coeff) memset(xd->eobs, 0, 25);

      intra_prediction_down_copy(xd, xd->recon_above[0] + 16);

      for (i = 0; i < 16; ++i) {
        BLOCKD *b = &xd->block[i];
        unsigned char *dst = xd->dst.y_buffer + b->offset;
        B_PREDICTION_MODE b_mode = xd->mode_info_context->bmi[i].as_mode;
        unsigned char *Above;
        unsigned char *yleft;
        int left_stride;
        unsigned char top_left;

        /*Caution: For some b_mode, it needs 8 pixels (4 above + 4
         * above-right).*/
        /* With the loop filter on, blocks in the top row (i < 4) take their
         * "above" pixels from xd->recon_above[0] instead of the frame
         * buffer. */
        if (i < 4 && pbi->common.filter_level) {
          Above = xd->recon_above[0] + b->offset;
        } else {
          Above = dst - dst_stride;
        }

        /* Likewise, blocks in the left column (i % 4 == 0) take their "left"
         * pixels from xd->recon_left[0], which is packed with stride 1. */
        if (i % 4 == 0 && pbi->common.filter_level) {
          yleft = xd->recon_left[0] + i;
          left_stride = 1;
        } else {
          yleft = dst - 1;
          left_stride = dst_stride;
        }

        /* Left-column blocks below the first row read their top-left pixel
         * from xd->recon_left[0] as well. */
        if ((i == 4 || i == 8 || i == 12) && pbi->common.filter_level) {
          top_left = *(xd->recon_left[0] + i - 1);
        } else {
          top_left = Above[-1];
        }

        vp8_intra4x4_predict(Above, yleft, left_stride, b_mode, dst, dst_stride,
                             top_left);

        if (xd->eobs[i]) {
          if (xd->eobs[i] > 1) {
            vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
          } else {
            /* Only the DC coefficient is present. */
            vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0], dst, dst_stride, dst,
                                 dst_stride);
            memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
          }
        }
      }
    }
  } else {
    vp8_build_inter_predictors_mb(xd);
  }

#if CONFIG_ERROR_CONCEALMENT
  /* Corrupt MBs keep the prediction as their reconstruction. */
  if (corruption_detected) {
    return;
  }
#endif

  if (!xd->mode_info_context->mbmi.mb_skip_coeff) {
    /* dequantization and idct */
    /* B_PRED luma residual was already added block by block above. */
    if (mode != B_PRED) {
      short *DQC = xd->dequant_y1;

      if (mode != SPLITMV) {
        BLOCKD *b = &xd->block[24];

        /* do 2nd order transform on the dc block */
        if (xd->eobs[24] > 1) {
          vp8_dequantize_b(b, xd->dequant_y2);

          vp8_short_inv_walsh4x4(&b->dqcoeff[0], xd->qcoeff);
          memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
        } else {
          b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
          vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], xd->qcoeff);
          memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
        }

        /* override the dc dequant constant in order to preserve the
         * dc components
         */
        DQC = xd->dequant_y1_dc;
      }

      vp8_dequant_idct_add_y_block(xd->qcoeff, DQC, xd->dst.y_buffer,
                                   xd->dst.y_stride, xd->eobs);
    }

    /* Chroma coefficients start after the 16 luma blocks of 16 values. */
    vp8_dequant_idct_add_uv_block(xd->qcoeff + 16 * 16, xd->dequant_uv,
                                  xd->dst.u_buffer, xd->dst.v_buffer,
                                  xd->dst.uv_stride, xd->eobs + 16);
  }
}
  207. static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
  208. int start_mb_row) {
  209. const vpx_atomic_int *last_row_current_mb_col;
  210. vpx_atomic_int *current_mb_col;
  211. int mb_row;
  212. VP8_COMMON *pc = &pbi->common;
  213. const int nsync = pbi->sync_range;
  214. const vpx_atomic_int first_row_no_sync_above =
  215. VPX_ATOMIC_INIT(pc->mb_cols + nsync);
  216. int num_part = 1 << pbi->common.multi_token_partition;
  217. int last_mb_row = start_mb_row;
  218. YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
  219. YV12_BUFFER_CONFIG *yv12_fb_lst = pbi->dec_fb_ref[LAST_FRAME];
  220. int recon_y_stride = yv12_fb_new->y_stride;
  221. int recon_uv_stride = yv12_fb_new->uv_stride;
  222. unsigned char *ref_buffer[MAX_REF_FRAMES][3];
  223. unsigned char *dst_buffer[3];
  224. int i;
  225. int ref_fb_corrupted[MAX_REF_FRAMES];
  226. ref_fb_corrupted[INTRA_FRAME] = 0;
  227. for (i = 1; i < MAX_REF_FRAMES; ++i) {
  228. YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
  229. ref_buffer[i][0] = this_fb->y_buffer;
  230. ref_buffer[i][1] = this_fb->u_buffer;
  231. ref_buffer[i][2] = this_fb->v_buffer;
  232. ref_fb_corrupted[i] = this_fb->corrupted;
  233. }
  234. dst_buffer[0] = yv12_fb_new->y_buffer;
  235. dst_buffer[1] = yv12_fb_new->u_buffer;
  236. dst_buffer[2] = yv12_fb_new->v_buffer;
  237. xd->up_available = (start_mb_row != 0);
  238. xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row;
  239. xd->mode_info_stride = pc->mode_info_stride;
  240. for (mb_row = start_mb_row; mb_row < pc->mb_rows;
  241. mb_row += (pbi->decoding_thread_count + 1)) {
  242. int recon_yoffset, recon_uvoffset;
  243. int mb_col;
  244. int filter_level;
  245. loop_filter_info_n *lfi_n = &pc->lf_info;
  246. /* save last row processed by this thread */
  247. last_mb_row = mb_row;
  248. /* select bool coder for current partition */
  249. xd->current_bc = &pbi->mbc[mb_row % num_part];
  250. if (mb_row > 0) {
  251. last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row - 1];
  252. } else {
  253. last_row_current_mb_col = &first_row_no_sync_above;
  254. }
  255. current_mb_col = &pbi->mt_current_mb_col[mb_row];
  256. recon_yoffset = mb_row * recon_y_stride * 16;
  257. recon_uvoffset = mb_row * recon_uv_stride * 8;
  258. /* reset contexts */
  259. xd->above_context = pc->above_context;
  260. memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
  261. xd->left_available = 0;
  262. xd->mb_to_top_edge = -((mb_row * 16) << 3);
  263. xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
  264. if (pbi->common.filter_level) {
  265. xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0 * 16 + 32;
  266. xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0 * 8 + 16;
  267. xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0 * 8 + 16;
  268. xd->recon_left[0] = pbi->mt_yleft_col[mb_row];
  269. xd->recon_left[1] = pbi->mt_uleft_col[mb_row];
  270. xd->recon_left[2] = pbi->mt_vleft_col[mb_row];
  271. /* TODO: move to outside row loop */
  272. xd->recon_left_stride[0] = 1;
  273. xd->recon_left_stride[1] = 1;
  274. } else {
  275. xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
  276. xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
  277. xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
  278. xd->recon_left[0] = xd->recon_above[0] - 1;
  279. xd->recon_left[1] = xd->recon_above[1] - 1;
  280. xd->recon_left[2] = xd->recon_above[2] - 1;
  281. xd->recon_above[0] -= xd->dst.y_stride;
  282. xd->recon_above[1] -= xd->dst.uv_stride;
  283. xd->recon_above[2] -= xd->dst.uv_stride;
  284. /* TODO: move to outside row loop */
  285. xd->recon_left_stride[0] = xd->dst.y_stride;
  286. xd->recon_left_stride[1] = xd->dst.uv_stride;
  287. setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
  288. xd->recon_left[2], xd->dst.y_stride,
  289. xd->dst.uv_stride);
  290. }
  291. for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) {
  292. if (((mb_col - 1) % nsync) == 0) {
  293. vpx_atomic_store_release(current_mb_col, mb_col - 1);
  294. }
  295. if (mb_row && !(mb_col & (nsync - 1))) {
  296. vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
  297. }
  298. /* Distance of MB to the various image edges.
  299. * These are specified to 8th pel as they are always
  300. * compared to values that are in 1/8th pel units.
  301. */
  302. xd->mb_to_left_edge = -((mb_col * 16) << 3);
  303. xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
  304. #if CONFIG_ERROR_CONCEALMENT
  305. {
  306. int corrupt_residual =
  307. (!pbi->independent_partitions && pbi->frame_corrupt_residual) ||
  308. vp8dx_bool_error(xd->current_bc);
  309. if (pbi->ec_active &&
  310. (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
  311. corrupt_residual) {
  312. /* We have an intra block with corrupt
  313. * coefficients, better to conceal with an inter
  314. * block.
  315. * Interpolate MVs from neighboring MBs
  316. *
  317. * Note that for the first mb with corrupt
  318. * residual in a frame, we might not discover
  319. * that before decoding the residual. That
  320. * happens after this check, and therefore no
  321. * inter concealment will be done.
  322. */
  323. vp8_interpolate_motion(xd, mb_row, mb_col, pc->mb_rows, pc->mb_cols);
  324. }
  325. }
  326. #endif
  327. xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
  328. xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
  329. xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
  330. xd->pre.y_buffer =
  331. ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
  332. xd->pre.u_buffer =
  333. ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
  334. xd->pre.v_buffer =
  335. ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
  336. /* propagate errors from reference frames */
  337. xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
  338. mt_decode_macroblock(pbi, xd, 0);
  339. xd->left_available = 1;
  340. /* check if the boolean decoder has suffered an error */
  341. xd->corrupted |= vp8dx_bool_error(xd->current_bc);
  342. xd->recon_above[0] += 16;
  343. xd->recon_above[1] += 8;
  344. xd->recon_above[2] += 8;
  345. if (!pbi->common.filter_level) {
  346. xd->recon_left[0] += 16;
  347. xd->recon_left[1] += 8;
  348. xd->recon_left[2] += 8;
  349. }
  350. if (pbi->common.filter_level) {
  351. int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
  352. xd->mode_info_context->mbmi.mode != SPLITMV &&
  353. xd->mode_info_context->mbmi.mb_skip_coeff);
  354. const int mode_index =
  355. lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
  356. const int seg = xd->mode_info_context->mbmi.segment_id;
  357. const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
  358. filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
  359. if (mb_row != pc->mb_rows - 1) {
  360. /* Save decoded MB last row data for next-row decoding */
  361. memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col * 16),
  362. (xd->dst.y_buffer + 15 * recon_y_stride), 16);
  363. memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col * 8),
  364. (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
  365. memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col * 8),
  366. (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
  367. }
  368. /* save left_col for next MB decoding */
  369. if (mb_col != pc->mb_cols - 1) {
  370. MODE_INFO *next = xd->mode_info_context + 1;
  371. if (next->mbmi.ref_frame == INTRA_FRAME) {
  372. for (i = 0; i < 16; ++i) {
  373. pbi->mt_yleft_col[mb_row][i] =
  374. xd->dst.y_buffer[i * recon_y_stride + 15];
  375. }
  376. for (i = 0; i < 8; ++i) {
  377. pbi->mt_uleft_col[mb_row][i] =
  378. xd->dst.u_buffer[i * recon_uv_stride + 7];
  379. pbi->mt_vleft_col[mb_row][i] =
  380. xd->dst.v_buffer[i * recon_uv_stride + 7];
  381. }
  382. }
  383. }
  384. /* loopfilter on this macroblock. */
  385. if (filter_level) {
  386. if (pc->filter_type == NORMAL_LOOPFILTER) {
  387. loop_filter_info lfi;
  388. FRAME_TYPE frame_type = pc->frame_type;
  389. const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
  390. lfi.mblim = lfi_n->mblim[filter_level];
  391. lfi.blim = lfi_n->blim[filter_level];
  392. lfi.lim = lfi_n->lim[filter_level];
  393. lfi.hev_thr = lfi_n->hev_thr[hev_index];
  394. if (mb_col > 0)
  395. vp8_loop_filter_mbv(xd->dst.y_buffer, xd->dst.u_buffer,
  396. xd->dst.v_buffer, recon_y_stride,
  397. recon_uv_stride, &lfi);
  398. if (!skip_lf)
  399. vp8_loop_filter_bv(xd->dst.y_buffer, xd->dst.u_buffer,
  400. xd->dst.v_buffer, recon_y_stride,
  401. recon_uv_stride, &lfi);
  402. /* don't apply across umv border */
  403. if (mb_row > 0)
  404. vp8_loop_filter_mbh(xd->dst.y_buffer, xd->dst.u_buffer,
  405. xd->dst.v_buffer, recon_y_stride,
  406. recon_uv_stride, &lfi);
  407. if (!skip_lf)
  408. vp8_loop_filter_bh(xd->dst.y_buffer, xd->dst.u_buffer,
  409. xd->dst.v_buffer, recon_y_stride,
  410. recon_uv_stride, &lfi);
  411. } else {
  412. if (mb_col > 0)
  413. vp8_loop_filter_simple_mbv(xd->dst.y_buffer, recon_y_stride,
  414. lfi_n->mblim[filter_level]);
  415. if (!skip_lf)
  416. vp8_loop_filter_simple_bv(xd->dst.y_buffer, recon_y_stride,
  417. lfi_n->blim[filter_level]);
  418. /* don't apply across umv border */
  419. if (mb_row > 0)
  420. vp8_loop_filter_simple_mbh(xd->dst.y_buffer, recon_y_stride,
  421. lfi_n->mblim[filter_level]);
  422. if (!skip_lf)
  423. vp8_loop_filter_simple_bh(xd->dst.y_buffer, recon_y_stride,
  424. lfi_n->blim[filter_level]);
  425. }
  426. }
  427. }
  428. recon_yoffset += 16;
  429. recon_uvoffset += 8;
  430. ++xd->mode_info_context; /* next mb */
  431. xd->above_context++;
  432. }
  433. /* adjust to the next row of mbs */
  434. if (pbi->common.filter_level) {
  435. if (mb_row != pc->mb_rows - 1) {
  436. int lasty = yv12_fb_lst->y_width + VP8BORDERINPIXELS;
  437. int lastuv = (yv12_fb_lst->y_width >> 1) + (VP8BORDERINPIXELS >> 1);
  438. for (i = 0; i < 4; ++i) {
  439. pbi->mt_yabove_row[mb_row + 1][lasty + i] =
  440. pbi->mt_yabove_row[mb_row + 1][lasty - 1];
  441. pbi->mt_uabove_row[mb_row + 1][lastuv + i] =
  442. pbi->mt_uabove_row[mb_row + 1][lastuv - 1];
  443. pbi->mt_vabove_row[mb_row + 1][lastuv + i] =
  444. pbi->mt_vabove_row[mb_row + 1][lastuv - 1];
  445. }
  446. }
  447. } else {
  448. vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
  449. xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
  450. }
  451. /* last MB of row is ready just after extension is done */
  452. vpx_atomic_store_release(current_mb_col, mb_col + nsync);
  453. ++xd->mode_info_context; /* skip prediction column */
  454. xd->up_available = 1;
  455. /* since we have multithread */
  456. xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
  457. }
  458. /* signal end of frame decoding if this thread processed the last mb_row */
  459. if (last_mb_row == (pc->mb_rows - 1)) sem_post(&pbi->h_event_end_decoding);
  460. }
  461. static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
  462. int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
  463. VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
  464. MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
  465. ENTROPY_CONTEXT_PLANES mb_row_left_context;
  466. while (1) {
  467. if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;
  468. if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
  469. if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
  470. break;
  471. } else {
  472. MACROBLOCKD *xd = &mbrd->mbd;
  473. xd->left_context = &mb_row_left_context;
  474. mt_decode_mb_rows(pbi, xd, ithread + 1);
  475. }
  476. }
  477. }
  478. return 0;
  479. }
  480. void vp8_decoder_create_threads(VP8D_COMP *pbi) {
  481. int core_count = 0;
  482. unsigned int ithread;
  483. vpx_atomic_init(&pbi->b_multithreaded_rd, 0);
  484. pbi->allocated_decoding_thread_count = 0;
  485. /* limit decoding threads to the max number of token partitions */
  486. core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
  487. /* limit decoding threads to the available cores */
  488. if (core_count > pbi->common.processor_core_count) {
  489. core_count = pbi->common.processor_core_count;
  490. }
  491. if (core_count > 1) {
  492. vpx_atomic_init(&pbi->b_multithreaded_rd, 1);
  493. pbi->decoding_thread_count = core_count - 1;
  494. CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
  495. CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
  496. CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
  497. CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
  498. if (sem_init(&pbi->h_event_end_decoding, 0, 0)) {
  499. vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
  500. "Failed to initialize semaphore");
  501. }
  502. for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) {
  503. if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
  504. vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
  505. pbi->de_thread_data[ithread].ithread = ithread;
  506. pbi->de_thread_data[ithread].ptr1 = (void *)pbi;
  507. pbi->de_thread_data[ithread].ptr2 = (void *)&pbi->mb_row_di[ithread];
  508. if (pthread_create(&pbi->h_decoding_thread[ithread], 0,
  509. thread_decoding_proc, &pbi->de_thread_data[ithread])) {
  510. sem_destroy(&pbi->h_event_start_decoding[ithread]);
  511. break;
  512. }
  513. }
  514. pbi->allocated_decoding_thread_count = ithread;
  515. if (pbi->allocated_decoding_thread_count !=
  516. (int)pbi->decoding_thread_count) {
  517. /* the remainder of cleanup cases will be handled in
  518. * vp8_decoder_remove_threads(). */
  519. if (pbi->allocated_decoding_thread_count == 0) {
  520. sem_destroy(&pbi->h_event_end_decoding);
  521. }
  522. vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
  523. "Failed to create threads");
  524. }
  525. }
  526. }
  527. void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) {
  528. int i;
  529. vpx_free(pbi->mt_current_mb_col);
  530. pbi->mt_current_mb_col = NULL;
  531. /* Free above_row buffers. */
  532. if (pbi->mt_yabove_row) {
  533. for (i = 0; i < mb_rows; ++i) {
  534. vpx_free(pbi->mt_yabove_row[i]);
  535. pbi->mt_yabove_row[i] = NULL;
  536. }
  537. vpx_free(pbi->mt_yabove_row);
  538. pbi->mt_yabove_row = NULL;
  539. }
  540. if (pbi->mt_uabove_row) {
  541. for (i = 0; i < mb_rows; ++i) {
  542. vpx_free(pbi->mt_uabove_row[i]);
  543. pbi->mt_uabove_row[i] = NULL;
  544. }
  545. vpx_free(pbi->mt_uabove_row);
  546. pbi->mt_uabove_row = NULL;
  547. }
  548. if (pbi->mt_vabove_row) {
  549. for (i = 0; i < mb_rows; ++i) {
  550. vpx_free(pbi->mt_vabove_row[i]);
  551. pbi->mt_vabove_row[i] = NULL;
  552. }
  553. vpx_free(pbi->mt_vabove_row);
  554. pbi->mt_vabove_row = NULL;
  555. }
  556. /* Free left_col buffers. */
  557. if (pbi->mt_yleft_col) {
  558. for (i = 0; i < mb_rows; ++i) {
  559. vpx_free(pbi->mt_yleft_col[i]);
  560. pbi->mt_yleft_col[i] = NULL;
  561. }
  562. vpx_free(pbi->mt_yleft_col);
  563. pbi->mt_yleft_col = NULL;
  564. }
  565. if (pbi->mt_uleft_col) {
  566. for (i = 0; i < mb_rows; ++i) {
  567. vpx_free(pbi->mt_uleft_col[i]);
  568. pbi->mt_uleft_col[i] = NULL;
  569. }
  570. vpx_free(pbi->mt_uleft_col);
  571. pbi->mt_uleft_col = NULL;
  572. }
  573. if (pbi->mt_vleft_col) {
  574. for (i = 0; i < mb_rows; ++i) {
  575. vpx_free(pbi->mt_vleft_col[i]);
  576. pbi->mt_vleft_col[i] = NULL;
  577. }
  578. vpx_free(pbi->mt_vleft_col);
  579. pbi->mt_vleft_col = NULL;
  580. }
  581. }
  582. void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
  583. VP8_COMMON *const pc = &pbi->common;
  584. int i;
  585. int uv_width;
  586. if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
  587. vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
  588. /* our internal buffers are always multiples of 16 */
  589. if ((width & 0xf) != 0) width += 16 - (width & 0xf);
  590. if (width < 640) {
  591. pbi->sync_range = 1;
  592. } else if (width <= 1280) {
  593. pbi->sync_range = 8;
  594. } else if (width <= 2560) {
  595. pbi->sync_range = 16;
  596. } else {
  597. pbi->sync_range = 32;
  598. }
  599. uv_width = width >> 1;
  600. /* Allocate a vpx_atomic_int for each mb row. */
  601. CHECK_MEM_ERROR(pbi->mt_current_mb_col,
  602. vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
  603. for (i = 0; i < pc->mb_rows; ++i)
  604. vpx_atomic_init(&pbi->mt_current_mb_col[i], 0);
  605. /* Allocate memory for above_row buffers. */
  606. CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
  607. for (i = 0; i < pc->mb_rows; ++i)
  608. CHECK_MEM_ERROR(
  609. pbi->mt_yabove_row[i],
  610. vpx_memalign(
  611. 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1))));
  612. CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
  613. for (i = 0; i < pc->mb_rows; ++i)
  614. CHECK_MEM_ERROR(
  615. pbi->mt_uabove_row[i],
  616. vpx_memalign(16,
  617. sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
  618. CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
  619. for (i = 0; i < pc->mb_rows; ++i)
  620. CHECK_MEM_ERROR(
  621. pbi->mt_vabove_row[i],
  622. vpx_memalign(16,
  623. sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
  624. /* Allocate memory for left_col buffers. */
  625. CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
  626. for (i = 0; i < pc->mb_rows; ++i)
  627. CHECK_MEM_ERROR(pbi->mt_yleft_col[i],
  628. vpx_calloc(sizeof(unsigned char) * 16, 1));
  629. CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
  630. for (i = 0; i < pc->mb_rows; ++i)
  631. CHECK_MEM_ERROR(pbi->mt_uleft_col[i],
  632. vpx_calloc(sizeof(unsigned char) * 8, 1));
  633. CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
  634. for (i = 0; i < pc->mb_rows; ++i)
  635. CHECK_MEM_ERROR(pbi->mt_vleft_col[i],
  636. vpx_calloc(sizeof(unsigned char) * 8, 1));
  637. }
  638. }
  639. void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
  640. /* shutdown MB Decoding thread; */
  641. if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
  642. int i;
  643. vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0);
  644. /* allow all threads to exit */
  645. for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
  646. sem_post(&pbi->h_event_start_decoding[i]);
  647. pthread_join(pbi->h_decoding_thread[i], NULL);
  648. }
  649. for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
  650. sem_destroy(&pbi->h_event_start_decoding[i]);
  651. }
  652. if (pbi->allocated_decoding_thread_count) {
  653. sem_destroy(&pbi->h_event_end_decoding);
  654. }
  655. vpx_free(pbi->h_decoding_thread);
  656. pbi->h_decoding_thread = NULL;
  657. vpx_free(pbi->h_event_start_decoding);
  658. pbi->h_event_start_decoding = NULL;
  659. vpx_free(pbi->mb_row_di);
  660. pbi->mb_row_di = NULL;
  661. vpx_free(pbi->de_thread_data);
  662. pbi->de_thread_data = NULL;
  663. vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
  664. }
  665. }
  666. void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
  667. VP8_COMMON *pc = &pbi->common;
  668. unsigned int i;
  669. int j;
  670. int filter_level = pc->filter_level;
  671. YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
  672. if (filter_level) {
  673. /* Set above_row buffer to 127 for decoding first MB row */
  674. memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS - 1, 127,
  675. yv12_fb_new->y_width + 5);
  676. memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS >> 1) - 1, 127,
  677. (yv12_fb_new->y_width >> 1) + 5);
  678. memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS >> 1) - 1, 127,
  679. (yv12_fb_new->y_width >> 1) + 5);
  680. for (j = 1; j < pc->mb_rows; ++j) {
  681. memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS - 1, (unsigned char)129,
  682. 1);
  683. memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS >> 1) - 1,
  684. (unsigned char)129, 1);
  685. memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS >> 1) - 1,
  686. (unsigned char)129, 1);
  687. }
  688. /* Set left_col to 129 initially */
  689. for (j = 0; j < pc->mb_rows; ++j) {
  690. memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
  691. memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
  692. memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
  693. }
  694. /* Initialize the loop filter for this frame. */
  695. vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level);
  696. } else {
  697. vp8_setup_intra_recon_top_line(yv12_fb_new);
  698. }
  699. setup_decoding_thread_data(pbi, xd, pbi->mb_row_di,
  700. pbi->decoding_thread_count);
  701. for (i = 0; i < pbi->decoding_thread_count; ++i) {
  702. sem_post(&pbi->h_event_start_decoding[i]);
  703. }
  704. mt_decode_mb_rows(pbi, xd, 0);
  705. sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
  706. }