/*!
 ***************************************************************************
 * \file transform8x8.c
 *
 * \brief
 *    8x8 transform functions
 *
 * \author
 *    Main contributors (see contributors.h for copyright, address and affiliation details)
 *    - Yuri Vatis
 *    - Jan Muenster
 *
 * \date
 *    12. October 2003
 **************************************************************************
 */
#include "global.h"
#include "image.h"
#include "mb_access.h"
#include "elements.h"
#include "transform8x8.h"
#include "transform.h"
#include "quant.h"

#include <emmintrin.h>

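/*
 * In-place SSE2 inverse 8x8 transform on 16-bit coefficients.
 * Each 1-D pass works on all eight rows (or columns) at once: the block is
 * transposed with unpack operations, the butterfly is applied across the
 * eight registers, and the block is transposed back before the vertical pass.
 */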
static void inverse8x8_sse2(h264_short_8x8block_row_t *block)
{
  __m128i a0, a1, a2, a3;
  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
  __m128i b0, b1, b2, b3, b4, b5, b6, b7;
  __m128i r0, r1, r2, r3, r4, r5, r6, r7;

  // Horizontal
  b0 = _mm_load_si128((__m128i *)(block[0]));
  b1 = _mm_load_si128((__m128i *)(block[1]));
  b2 = _mm_load_si128((__m128i *)(block[2]));
  b3 = _mm_load_si128((__m128i *)(block[3]));
  b4 = _mm_load_si128((__m128i *)(block[4]));
  b5 = _mm_load_si128((__m128i *)(block[5]));
  b6 = _mm_load_si128((__m128i *)(block[6]));
  b7 = _mm_load_si128((__m128i *)(block[7]));

  /* rotate 8x8 (ugh) */
  r0 = _mm_unpacklo_epi16(b0, b2);
  r1 = _mm_unpacklo_epi16(b1, b3);
  r2 = _mm_unpackhi_epi16(b0, b2);
  r3 = _mm_unpackhi_epi16(b1, b3);
  r4 = _mm_unpacklo_epi16(b4, b6);
  r5 = _mm_unpacklo_epi16(b5, b7);
  r6 = _mm_unpackhi_epi16(b4, b6);
  r7 = _mm_unpackhi_epi16(b5, b7);
  b0 = _mm_unpacklo_epi16(r0, r1);
  b1 = _mm_unpackhi_epi16(r0, r1);
  b2 = _mm_unpacklo_epi16(r2, r3);
  b3 = _mm_unpackhi_epi16(r2, r3);
  b4 = _mm_unpacklo_epi16(r4, r5);
  b5 = _mm_unpackhi_epi16(r4, r5);
  b6 = _mm_unpacklo_epi16(r6, r7);
  b7 = _mm_unpackhi_epi16(r6, r7);
  p0 = _mm_unpacklo_epi64(b0, b4);
  p1 = _mm_unpackhi_epi64(b0, b4);
  p2 = _mm_unpacklo_epi64(b1, b5);
  p3 = _mm_unpackhi_epi64(b1, b5);
  p4 = _mm_unpacklo_epi64(b2, b6);
  p5 = _mm_unpackhi_epi64(b2, b6);
  p6 = _mm_unpacklo_epi64(b3, b7);
  p7 = _mm_unpackhi_epi64(b3, b7);

  /* perform approx DCT */
  a0 = _mm_add_epi16(p0, p4);  // p0 + p4
  a1 = _mm_sub_epi16(p0, p4);  // p0 - p4
  r0 = _mm_srai_epi16(p2, 1);  // p2 >> 1
  a2 = _mm_sub_epi16(p6, r0);  // p6 - (p2 >> 1)
  r0 = _mm_srai_epi16(p6, 1);  // p6 >> 1
  a3 = _mm_add_epi16(p2, r0);  // p2 + (p6 >> 1)

  b0 = _mm_add_epi16(a0, a3);  // a0 + a3
  b2 = _mm_sub_epi16(a1, a2);  // a1 - a2
  b4 = _mm_add_epi16(a1, a2);  // a1 + a2
  b6 = _mm_sub_epi16(a0, a3);  // a0 - a3

  // -p3 + p5 - p7 - (p7 >> 1)
  r0 = _mm_srai_epi16(p7, 1);  // p7 >> 1
  a0 = _mm_sub_epi16(p5, p3);  // p5 - p3
  a0 = _mm_sub_epi16(a0, p7);  // (-p3 + p5) - p7
  a0 = _mm_sub_epi16(a0, r0);  // (-p3 + p5 - p7) - (p7 >> 1)
  // p1 + p7 - p3 - (p3 >> 1)
  r0 = _mm_srai_epi16(p3, 1);  // p3 >> 1
  a1 = _mm_add_epi16(p1, p7);  // p1 + p7
  a1 = _mm_sub_epi16(a1, p3);  // (p1 + p7) - p3
  a1 = _mm_sub_epi16(a1, r0);  // (p1 + p7 - p3) - (p3 >> 1)
  // -p1 + p7 + p5 + (p5 >> 1)
  r0 = _mm_srai_epi16(p5, 1);  // p5 >> 1
  a2 = _mm_sub_epi16(p7, p1);  // p7 - p1
  a2 = _mm_add_epi16(a2, p5);  // -p1 + p7 + p5
  a2 = _mm_add_epi16(a2, r0);  // (-p1 + p7 + p5) + (p5 >> 1)
  // p3 + p5 + p1 + (p1 >> 1)
  a3 = _mm_add_epi16(p3, p5);  // p3 + p5
  a3 = _mm_add_epi16(a3, p1);  // p3 + p5 + p1
  p1 = _mm_srai_epi16(p1, 1);  // p1 >> 1
  a3 = _mm_add_epi16(a3, p1);  // p3 + p5 + p1 + (p1 >> 1)

  r0 = _mm_srai_epi16(a3, 2);  // a3 >> 2
  b1 = _mm_add_epi16(a0, r0);  // a0 + (a3 >> 2)
  r0 = _mm_srai_epi16(a2, 2);  // a2 >> 2
  b3 = _mm_add_epi16(a1, r0);  // a1 + (a2 >> 2)
  a1 = _mm_srai_epi16(a1, 2);  // all done with a1, so this is safe
  b5 = _mm_sub_epi16(a2, a1);  // a2 - (a1 >> 2)
  a0 = _mm_srai_epi16(a0, 2);  // all done with a0, so this is safe
  b7 = _mm_sub_epi16(a3, a0);  // a3 - (a0 >> 2)

  p0 = _mm_add_epi16(b0, b7);  // b0 + b7
  p1 = _mm_sub_epi16(b2, b5);  // b2 - b5
  p2 = _mm_add_epi16(b4, b3);  // b4 + b3
  p3 = _mm_add_epi16(b6, b1);  // b6 + b1
  p4 = _mm_sub_epi16(b6, b1);  // b6 - b1
  p5 = _mm_sub_epi16(b4, b3);  // b4 - b3
  p6 = _mm_add_epi16(b2, b5);  // b2 + b5
  p7 = _mm_sub_epi16(b0, b7);  // b0 - b7

  /* rotate 8x8 (ugh) */
  r0 = _mm_unpacklo_epi16(p0, p2);
  r1 = _mm_unpacklo_epi16(p1, p3);
  r2 = _mm_unpackhi_epi16(p0, p2);
  r3 = _mm_unpackhi_epi16(p1, p3);
  r4 = _mm_unpacklo_epi16(p4, p6);
  r5 = _mm_unpacklo_epi16(p5, p7);
  r6 = _mm_unpackhi_epi16(p4, p6);
  r7 = _mm_unpackhi_epi16(p5, p7);
  b0 = _mm_unpacklo_epi16(r0, r1);
  b1 = _mm_unpackhi_epi16(r0, r1);
  b2 = _mm_unpacklo_epi16(r2, r3);
  b3 = _mm_unpackhi_epi16(r2, r3);
  b4 = _mm_unpacklo_epi16(r4, r5);
  b5 = _mm_unpackhi_epi16(r4, r5);
  b6 = _mm_unpacklo_epi16(r6, r7);
  b7 = _mm_unpackhi_epi16(r6, r7);
  p0 = _mm_unpacklo_epi64(b0, b4);
  p1 = _mm_unpackhi_epi64(b0, b4);
  p2 = _mm_unpacklo_epi64(b1, b5);
  p3 = _mm_unpackhi_epi64(b1, b5);
  p4 = _mm_unpacklo_epi64(b2, b6);
  p5 = _mm_unpackhi_epi64(b2, b6);
  p6 = _mm_unpacklo_epi64(b3, b7);
  p7 = _mm_unpackhi_epi64(b3, b7);

  /* Vertical */
  a0 = _mm_add_epi16(p0, p4);  // p0 + p4
  a1 = _mm_sub_epi16(p0, p4);  // p0 - p4
  r0 = _mm_srai_epi16(p2, 1);  // p2 >> 1
  a2 = _mm_sub_epi16(p6, r0);  // p6 - (p2 >> 1)
  r0 = _mm_srai_epi16(p6, 1);  // p6 >> 1
  a3 = _mm_add_epi16(p2, r0);  // p2 + (p6 >> 1)

  b0 = _mm_add_epi16(a0, a3);  // a0 + a3
  b2 = _mm_sub_epi16(a1, a2);  // a1 - a2
  b4 = _mm_add_epi16(a1, a2);  // a1 + a2
  b6 = _mm_sub_epi16(a0, a3);  // a0 - a3

  // -p3 + p5 - p7 - (p7 >> 1)
  r0 = _mm_srai_epi16(p7, 1);  // p7 >> 1
  a0 = _mm_sub_epi16(p5, p3);  // p5 - p3
  a0 = _mm_sub_epi16(a0, p7);  // (-p3 + p5) - p7
  a0 = _mm_sub_epi16(a0, r0);  // (-p3 + p5 - p7) - (p7 >> 1)
  // p1 + p7 - p3 - (p3 >> 1)
  r0 = _mm_srai_epi16(p3, 1);  // p3 >> 1
  a1 = _mm_add_epi16(p1, p7);  // p1 + p7
  a1 = _mm_sub_epi16(a1, p3);  // (p1 + p7) - p3
  a1 = _mm_sub_epi16(a1, r0);  // (p1 + p7 - p3) - (p3 >> 1)
  // -p1 + p7 + p5 + (p5 >> 1)
  r0 = _mm_srai_epi16(p5, 1);  // p5 >> 1
  a2 = _mm_sub_epi16(p7, p1);  // p7 - p1
  a2 = _mm_add_epi16(a2, p5);  // -p1 + p7 + p5
  a2 = _mm_add_epi16(a2, r0);  // (-p1 + p7 + p5) + (p5 >> 1)
  // p3 + p5 + p1 + (p1 >> 1)
  r0 = _mm_srai_epi16(p1, 1);  // p1 >> 1
  a3 = _mm_add_epi16(p3, p5);  // p3 + p5
  a3 = _mm_add_epi16(a3, p1);  // p3 + p5 + p1
  a3 = _mm_add_epi16(a3, r0);  // p3 + p5 + p1 + (p1 >> 1)

  r0 = _mm_srai_epi16(a3, 2);  // a3 >> 2
  b1 = _mm_add_epi16(a0, r0);  // a0 + (a3 >> 2)
  r0 = _mm_srai_epi16(a2, 2);  // a2 >> 2
  b3 = _mm_add_epi16(a1, r0);  // a1 + (a2 >> 2)
  a1 = _mm_srai_epi16(a1, 2);  // all done with a1, so this is safe
  b5 = _mm_sub_epi16(a2, a1);  // a2 - (a1 >> 2)
  a0 = _mm_srai_epi16(a0, 2);  // all done with a0, so this is safe
  b7 = _mm_sub_epi16(a3, a0);  // a3 - (a0 >> 2)

  r0 = _mm_add_epi16(b0, b7);  // b0 + b7
  _mm_store_si128((__m128i *)(block[0]), r0);
  r1 = _mm_sub_epi16(b2, b5);  // b2 - b5
  _mm_store_si128((__m128i *)(block[1]), r1);
  r2 = _mm_add_epi16(b4, b3);  // b4 + b3
  _mm_store_si128((__m128i *)(block[2]), r2);
  r3 = _mm_add_epi16(b6, b1);  // b6 + b1
  _mm_store_si128((__m128i *)(block[3]), r3);
  r4 = _mm_sub_epi16(b6, b1);  // b6 - b1
  _mm_store_si128((__m128i *)(block[4]), r4);
  r5 = _mm_sub_epi16(b4, b3);  // b4 - b3
  _mm_store_si128((__m128i *)(block[5]), r5);
  r6 = _mm_add_epi16(b2, b5);  // b2 + b5
  _mm_store_si128((__m128i *)(block[6]), r6);
  r7 = _mm_sub_epi16(b0, b7);  // b0 - b7
  _mm_store_si128((__m128i *)(block[7]), r7);
}

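/*
 * Plain-C in-place inverse 8x8 transform: horizontal pass over the rows,
 * then vertical pass over the columns. Used by the non-SSE2 entry points.
 */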
static void inverse8x8(h264_short_8x8block_row_t *block)
{
  int i;
  int a0, a1, a2, a3;
  int p0, p1, p2, p3, p4, p5, p6, p7;
  int b0, b1, b2, b3, b4, b5, b6, b7;

  // Horizontal
  for (i = 0; i < BLOCK_SIZE_8x8; i++)
  {
    p0 = block[i][0];
    p1 = block[i][1];
    p2 = block[i][2];
    p3 = block[i][3];
    p4 = block[i][4];
    p5 = block[i][5];
    p6 = block[i][6];
    p7 = block[i][7];

    a0 = p0 + p4;
    a1 = p0 - p4;
    a2 = p6 - (p2 >> 1);
    a3 = p2 + (p6 >> 1);

    b0 = a0 + a3;
    b2 = a1 - a2;
    b4 = a1 + a2;
    b6 = a0 - a3;

    a0 = p5 - p3 - p7 - (p7 >> 1);
    a1 = p1 + p7 - p3 - (p3 >> 1);
    a2 = p7 - p1 + p5 + (p5 >> 1);
    a3 = p3 + p5 + p1 + (p1 >> 1);

    b1 = a0 + (a3 >> 2);
    b3 = a1 + (a2 >> 2);
    b5 = a2 - (a1 >> 2);
    b7 = a3 - (a0 >> 2);

    block[i][0] = b0 + b7;
    block[i][1] = b2 - b5;
    block[i][2] = b4 + b3;
    block[i][3] = b6 + b1;
    block[i][4] = b6 - b1;
    block[i][5] = b4 - b3;
    block[i][6] = b2 + b5;
    block[i][7] = b0 - b7;
  }

  // Vertical
  for (i = 0; i < BLOCK_SIZE_8x8; i++)
  {
    p0 = block[0][i];
    p1 = block[1][i];
    p2 = block[2][i];
    p3 = block[3][i];
    p4 = block[4][i];
    p5 = block[5][i];
    p6 = block[6][i];
    p7 = block[7][i];

    a0 = p0 + p4;
    a1 = p0 - p4;
    a2 = p6 - (p2 >> 1);
    a3 = p2 + (p6 >> 1);

    b0 = a0 + a3;
    b2 = a1 - a2;
    b4 = a1 + a2;
    b6 = a0 - a3;

    a0 = -p3 + p5 - p7 - (p7 >> 1);
    a1 = p1 + p7 - p3 - (p3 >> 1);
    a2 = -p1 + p7 + p5 + (p5 >> 1);
    a3 = p3 + p5 + p1 + (p1 >> 1);

    b1 = a0 + (a3 >> 2);
    b7 = a3 - (a0 >> 2);
    b3 = a1 + (a2 >> 2);
    b5 = a2 - (a1 >> 2);

    block[0][i] = b0 + b7;
    block[1][i] = b2 - b5;
    block[2][i] = b4 + b3;
    block[3][i] = b6 + b1;
    block[4][i] = b6 - b1;
    block[5][i] = b4 - b3;
    block[6][i] = b2 + b5;
    block[7][i] = b0 - b7;
  }
}

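/*
 * SSE2 inverse 8x8 transform plus reconstruction: applies the same transform
 * as inverse8x8_sse2, then adds the rounded residual, (x + 32) >> 6, to the
 * prediction two rows at a time and clamps to [0, 255] with the saturating pack.
 */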
#if defined(_DEBUG) || defined(_M_X64)
void itrans8x8_sse2(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
{
  __m128i a0, a1, a2, a3;
  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
  __m128i b0, b1, b2, b3, b4, b5, b6, b7;
  __m128i r0, r1, r2, r3, r4, r5, r6, r7;
  __m128i const32, zero;
  __declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
  __m128i pred0, pred1;

  const32 = _mm_load_si128((const __m128i *)c32);
  zero = _mm_setzero_si128();

  // Horizontal
  b0 = _mm_load_si128((__m128i *)(block[0]));
  b1 = _mm_load_si128((__m128i *)(block[1]));
  b2 = _mm_load_si128((__m128i *)(block[2]));
  b3 = _mm_load_si128((__m128i *)(block[3]));
  b4 = _mm_load_si128((__m128i *)(block[4]));
  b5 = _mm_load_si128((__m128i *)(block[5]));
  b6 = _mm_load_si128((__m128i *)(block[6]));
  b7 = _mm_load_si128((__m128i *)(block[7]));

  /* rotate 8x8 (ugh) */
  r0 = _mm_unpacklo_epi16(b0, b2);
  r1 = _mm_unpacklo_epi16(b1, b3);
  r2 = _mm_unpackhi_epi16(b0, b2);
  r3 = _mm_unpackhi_epi16(b1, b3);
  r4 = _mm_unpacklo_epi16(b4, b6);
  r5 = _mm_unpacklo_epi16(b5, b7);
  r6 = _mm_unpackhi_epi16(b4, b6);
  r7 = _mm_unpackhi_epi16(b5, b7);
  b0 = _mm_unpacklo_epi16(r0, r1);
  b1 = _mm_unpackhi_epi16(r0, r1);
  b2 = _mm_unpacklo_epi16(r2, r3);
  b3 = _mm_unpackhi_epi16(r2, r3);
  b4 = _mm_unpacklo_epi16(r4, r5);
  b5 = _mm_unpackhi_epi16(r4, r5);
  b6 = _mm_unpacklo_epi16(r6, r7);
  b7 = _mm_unpackhi_epi16(r6, r7);
  p0 = _mm_unpacklo_epi64(b0, b4);
  p1 = _mm_unpackhi_epi64(b0, b4);
  p2 = _mm_unpacklo_epi64(b1, b5);
  p3 = _mm_unpackhi_epi64(b1, b5);
  p4 = _mm_unpacklo_epi64(b2, b6);
  p5 = _mm_unpackhi_epi64(b2, b6);
  p6 = _mm_unpacklo_epi64(b3, b7);
  p7 = _mm_unpackhi_epi64(b3, b7);

  /* perform approx DCT */
  a0 = _mm_add_epi16(p0, p4);  // p0 + p4
  a1 = _mm_sub_epi16(p0, p4);  // p0 - p4
  r0 = _mm_srai_epi16(p2, 1);  // p2 >> 1
  a2 = _mm_sub_epi16(p6, r0);  // p6 - (p2 >> 1)
  r0 = _mm_srai_epi16(p6, 1);  // p6 >> 1
  a3 = _mm_add_epi16(p2, r0);  // p2 + (p6 >> 1)

  b0 = _mm_add_epi16(a0, a3);  // a0 + a3
  b2 = _mm_sub_epi16(a1, a2);  // a1 - a2
  b4 = _mm_add_epi16(a1, a2);  // a1 + a2
  b6 = _mm_sub_epi16(a0, a3);  // a0 - a3

  // -p3 + p5 - p7 - (p7 >> 1)
  r0 = _mm_srai_epi16(p7, 1);  // p7 >> 1
  a0 = _mm_sub_epi16(p5, p3);  // p5 - p3
  a0 = _mm_sub_epi16(a0, p7);  // (-p3 + p5) - p7
  a0 = _mm_sub_epi16(a0, r0);  // (-p3 + p5 - p7) - (p7 >> 1)
  // p1 + p7 - p3 - (p3 >> 1)
  r0 = _mm_srai_epi16(p3, 1);  // p3 >> 1
  a1 = _mm_add_epi16(p1, p7);  // p1 + p7
  a1 = _mm_sub_epi16(a1, p3);  // (p1 + p7) - p3
  a1 = _mm_sub_epi16(a1, r0);  // (p1 + p7 - p3) - (p3 >> 1)
  // -p1 + p7 + p5 + (p5 >> 1)
  r0 = _mm_srai_epi16(p5, 1);  // p5 >> 1
  a2 = _mm_sub_epi16(p7, p1);  // p7 - p1
  a2 = _mm_add_epi16(a2, p5);  // -p1 + p7 + p5
  a2 = _mm_add_epi16(a2, r0);  // (-p1 + p7 + p5) + (p5 >> 1)
  // p3 + p5 + p1 + (p1 >> 1)
  a3 = _mm_add_epi16(p3, p5);  // p3 + p5
  a3 = _mm_add_epi16(a3, p1);  // p3 + p5 + p1
  p1 = _mm_srai_epi16(p1, 1);  // p1 >> 1
  a3 = _mm_add_epi16(a3, p1);  // p3 + p5 + p1 + (p1 >> 1)

  r0 = _mm_srai_epi16(a3, 2);  // a3 >> 2
  b1 = _mm_add_epi16(a0, r0);  // a0 + (a3 >> 2)
  r0 = _mm_srai_epi16(a2, 2);  // a2 >> 2
  b3 = _mm_add_epi16(a1, r0);  // a1 + (a2 >> 2)
  a1 = _mm_srai_epi16(a1, 2);  // all done with a1, so this is safe
  b5 = _mm_sub_epi16(a2, a1);  // a2 - (a1 >> 2)
  a0 = _mm_srai_epi16(a0, 2);  // all done with a0, so this is safe
  b7 = _mm_sub_epi16(a3, a0);  // a3 - (a0 >> 2)

  p0 = _mm_add_epi16(b0, b7);  // b0 + b7
  p1 = _mm_sub_epi16(b2, b5);  // b2 - b5
  p2 = _mm_add_epi16(b4, b3);  // b4 + b3
  p3 = _mm_add_epi16(b6, b1);  // b6 + b1
  p4 = _mm_sub_epi16(b6, b1);  // b6 - b1
  p5 = _mm_sub_epi16(b4, b3);  // b4 - b3
  p6 = _mm_add_epi16(b2, b5);  // b2 + b5
  p7 = _mm_sub_epi16(b0, b7);  // b0 - b7

  /* rotate 8x8 (ugh) */
  r0 = _mm_unpacklo_epi16(p0, p2);
  r1 = _mm_unpacklo_epi16(p1, p3);
  r2 = _mm_unpackhi_epi16(p0, p2);
  r3 = _mm_unpackhi_epi16(p1, p3);
  r4 = _mm_unpacklo_epi16(p4, p6);
  r5 = _mm_unpacklo_epi16(p5, p7);
  r6 = _mm_unpackhi_epi16(p4, p6);
  r7 = _mm_unpackhi_epi16(p5, p7);
  b0 = _mm_unpacklo_epi16(r0, r1);
  b1 = _mm_unpackhi_epi16(r0, r1);
  b2 = _mm_unpacklo_epi16(r2, r3);
  b3 = _mm_unpackhi_epi16(r2, r3);
  b4 = _mm_unpacklo_epi16(r4, r5);
  b5 = _mm_unpackhi_epi16(r4, r5);
  b6 = _mm_unpacklo_epi16(r6, r7);
  b7 = _mm_unpackhi_epi16(r6, r7);
  p0 = _mm_unpacklo_epi64(b0, b4);
  p1 = _mm_unpackhi_epi64(b0, b4);
  p2 = _mm_unpacklo_epi64(b1, b5);
  p3 = _mm_unpackhi_epi64(b1, b5);
  p4 = _mm_unpacklo_epi64(b2, b6);
  p5 = _mm_unpackhi_epi64(b2, b6);
  p6 = _mm_unpacklo_epi64(b3, b7);
  p7 = _mm_unpackhi_epi64(b3, b7);

  /* Vertical */
  a0 = _mm_add_epi16(p0, p4);  // p0 + p4
  a1 = _mm_sub_epi16(p0, p4);  // p0 - p4
  r0 = _mm_srai_epi16(p2, 1);  // p2 >> 1
  a2 = _mm_sub_epi16(p6, r0);  // p6 - (p2 >> 1)
  r0 = _mm_srai_epi16(p6, 1);  // p6 >> 1
  a3 = _mm_add_epi16(p2, r0);  // p2 + (p6 >> 1)

  b0 = _mm_add_epi16(a0, a3);  // a0 + a3
  b2 = _mm_sub_epi16(a1, a2);  // a1 - a2
  b4 = _mm_add_epi16(a1, a2);  // a1 + a2
  b6 = _mm_sub_epi16(a0, a3);  // a0 - a3

  // -p3 + p5 - p7 - (p7 >> 1)
  r0 = _mm_srai_epi16(p7, 1);  // p7 >> 1
  a0 = _mm_sub_epi16(p5, p3);  // p5 - p3
  a0 = _mm_sub_epi16(a0, p7);  // (-p3 + p5) - p7
  a0 = _mm_sub_epi16(a0, r0);  // (-p3 + p5 - p7) - (p7 >> 1)
  // p1 + p7 - p3 - (p3 >> 1)
  r0 = _mm_srai_epi16(p3, 1);  // p3 >> 1
  a1 = _mm_add_epi16(p1, p7);  // p1 + p7
  a1 = _mm_sub_epi16(a1, p3);  // (p1 + p7) - p3
  a1 = _mm_sub_epi16(a1, r0);  // (p1 + p7 - p3) - (p3 >> 1)
  // -p1 + p7 + p5 + (p5 >> 1)
  r0 = _mm_srai_epi16(p5, 1);  // p5 >> 1
  a2 = _mm_sub_epi16(p7, p1);  // p7 - p1
  a2 = _mm_add_epi16(a2, p5);  // -p1 + p7 + p5
  a2 = _mm_add_epi16(a2, r0);  // (-p1 + p7 + p5) + (p5 >> 1)
  // p3 + p5 + p1 + (p1 >> 1)
  r0 = _mm_srai_epi16(p1, 1);  // p1 >> 1
  a3 = _mm_add_epi16(p3, p5);  // p3 + p5
  a3 = _mm_add_epi16(a3, p1);  // p3 + p5 + p1
  a3 = _mm_add_epi16(a3, r0);  // p3 + p5 + p1 + (p1 >> 1)

  r0 = _mm_srai_epi16(a3, 2);  // a3 >> 2
  b1 = _mm_add_epi16(a0, r0);  // a0 + (a3 >> 2)
  r0 = _mm_srai_epi16(a2, 2);  // a2 >> 2
  b3 = _mm_add_epi16(a1, r0);  // a1 + (a2 >> 2)
  a1 = _mm_srai_epi16(a1, 2);  // all done with a1, so this is safe
  b5 = _mm_sub_epi16(a2, a1);  // a2 - (a1 >> 2)
  a0 = _mm_srai_epi16(a0, 2);  // all done with a0, so this is safe
  b7 = _mm_sub_epi16(a3, a0);  // a3 - (a0 >> 2)

  r0 = _mm_add_epi16(b0, b7);  // b0 + b7
  r1 = _mm_sub_epi16(b2, b5);  // b2 - b5
  r2 = _mm_add_epi16(b4, b3);  // b4 + b3
  r3 = _mm_add_epi16(b6, b1);  // b6 + b1
  r4 = _mm_sub_epi16(b6, b1);  // b6 - b1
  r5 = _mm_sub_epi16(b4, b3);  // b4 - b3
  r6 = _mm_add_epi16(b2, b5);  // b2 + b5
  r7 = _mm_sub_epi16(b0, b7);  // b0 - b7

  // add in prediction values
  pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
  pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
  // (x + 32) >> 6
  r0 = _mm_adds_epi16(r0, const32);
  r0 = _mm_srai_epi16(r0, 6);
  r1 = _mm_adds_epi16(r1, const32);
  r1 = _mm_srai_epi16(r1, 6);
  pred0 = _mm_unpacklo_epi8(pred0, zero);  // convert to short
  pred1 = _mm_unpacklo_epi8(pred1, zero);  // convert to short
  pred0 = _mm_adds_epi16(pred0, r0);
  pred1 = _mm_adds_epi16(pred1, r1);
  pred0 = _mm_packus_epi16(pred0, pred1);  // convert to unsigned char
  // store
  _mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
  // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
  pred0 = _mm_srli_si128(pred0, 8);
  _mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);

  /* --- */
  pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
  pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
  // (x + 32) >> 6
  r2 = _mm_adds_epi16(r2, const32);
  r2 = _mm_srai_epi16(r2, 6);
  r3 = _mm_adds_epi16(r3, const32);
  r3 = _mm_srai_epi16(r3, 6);
  pred0 = _mm_unpacklo_epi8(pred0, zero);  // convert to short
  pred1 = _mm_unpacklo_epi8(pred1, zero);  // convert to short
  pred0 = _mm_adds_epi16(pred0, r2);
  pred1 = _mm_adds_epi16(pred1, r3);
  pred0 = _mm_packus_epi16(pred0, pred1);  // convert to unsigned char
  // store
  _mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
  // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
  pred0 = _mm_srli_si128(pred0, 8);
  _mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);

  /* --- */
  pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
  pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
  // (x + 32) >> 6
  r4 = _mm_adds_epi16(r4, const32);
  r4 = _mm_srai_epi16(r4, 6);
  r5 = _mm_adds_epi16(r5, const32);
  r5 = _mm_srai_epi16(r5, 6);
  pred0 = _mm_unpacklo_epi8(pred0, zero);  // convert to short
  pred1 = _mm_unpacklo_epi8(pred1, zero);  // convert to short
  pred0 = _mm_adds_epi16(pred0, r4);
  pred1 = _mm_adds_epi16(pred1, r5);
  pred0 = _mm_packus_epi16(pred0, pred1);  // convert to unsigned char
  // store
  _mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
  // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
  pred0 = _mm_srli_si128(pred0, 8);
  _mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);

  /* --- */
  pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
  pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
  // (x + 32) >> 6
  r6 = _mm_adds_epi16(r6, const32);
  r6 = _mm_srai_epi16(r6, 6);
  r7 = _mm_adds_epi16(r7, const32);
  r7 = _mm_srai_epi16(r7, 6);
  pred0 = _mm_unpacklo_epi8(pred0, zero);  // convert to short
  pred1 = _mm_unpacklo_epi8(pred1, zero);  // convert to short
  pred0 = _mm_adds_epi16(pred0, r6);
  pred1 = _mm_adds_epi16(pred1, r7);
  pred0 = _mm_packus_epi16(pred0, pred1);  // convert to unsigned char
  // store
  _mm_storel_epi64((__m128i *)(&mb_rec[6][pos_x]), pred0);
  // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
  pred0 = _mm_srli_si128(pred0, 8);
  _mm_storel_epi64((__m128i *)(&mb_rec[7][pos_x]), pred0);
}
#endif

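/*
 * 32-bit x86 reconstruction helper in MMX inline assembly:
 * rec = pred + ((rres + 32) >> 6), clamped to [0, 255] by packuswb.
 */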
#ifdef _M_IX86
// reconstruction from 16-bit residual coefficients (rres rows are 8 shorts / 16 bytes)
static void sample_reconstruct8x8_mmx(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *mb_rres8, int pos_x)
{
  __asm
  {
    mov esi, 8                    // loop 8 times (one row per iteration)
    mov eax, mb_rec
    add eax, pos_x
    mov ebx, mb_pred
    add ebx, pos_x
    mov ecx, mb_rres8
    // mm0: constant value 32 in each 16-bit lane
    mov edx, 0x00200020
    movd mm0, edx
    punpckldq mm0, mm0
    // mm7: zero
    pxor mm7, mm7
loop8:
    movq mm1, MMWORD PTR 0[ecx]   // rres[0..3]
    movq mm3, MMWORD PTR 8[ecx]   // rres[4..7]
    paddw mm1, mm0                // rres + 32
    paddw mm3, mm0
    psraw mm1, 6                  // (rres + 32) >> 6
    psraw mm3, 6
    movq mm2, MMWORD PTR 0[ebx]   // eight prediction pixels
    movq mm4, mm2
    punpcklbw mm2, mm7            // pred[0..3]: unsigned char -> short
    punpckhbw mm4, mm7            // pred[4..7]: unsigned char -> short
    paddsw mm2, mm1               // pred_row + rres_row
    paddsw mm4, mm3
    packuswb mm2, mm4             // clamp to [0, 255], repack to bytes
    movq MMWORD PTR 0[eax], mm2
    add eax, 16
    add ebx, 16
    add ecx, 16
    sub esi, 1
    jne loop8
    emms
  }
}
#endif

// benski> unused, left in place for unit testing and if we ever need to port the decoder to non-intel
static void sample_reconstruct8x8(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *mb_rres8, int pos_x, int max_imgpel_value)
{
  int i, j;
  for (j = 0; j < 8; j++)
  {
    imgpel *rec_row = mb_rec[j] + pos_x;
    const short *rres_row = mb_rres8[j];
    const imgpel *pred_row = mb_pred[j] + pos_x;
    for (i = 0; i < 8; i++)
      rec_row[i] = (imgpel) iClip1(max_imgpel_value, pred_row[i] + rshift_rnd_sf(rres_row[i], DQ_BITS_8));
  }
}

/*!
 ***********************************************************************
 * \brief
 *    Inverse 8x8 transformation
 ***********************************************************************
 */
#ifdef _M_IX86
void itrans8x8_mmx(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
{
  inverse8x8((h264_short_8x8block_row_t *)block);
  sample_reconstruct8x8_mmx(mb_rec, mb_pred, block, pos_x);
}
#endif

void itrans8x8_c(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
{
  inverse8x8((h264_short_8x8block_row_t *)block);
  sample_reconstruct8x8(mb_rec, mb_pred, block, pos_x, 255);
}

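/*
 * Lossless (transform-bypass) path: the residual is added directly to the
 * prediction, with no inverse transform and no (x + 32) >> 6 rounding.
 */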
void itrans8x8_lossless(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
{
  int i, j;
  for (j = 0; j < 8; j++)
  {
    imgpel *rec_row = mb_rec[j] + pos_x;
    const short *rres_row = block[j];
    const imgpel *pred_row = mb_pred[j] + pos_x;
    for (i = 0; i < 8; i++)
      rec_row[i] = (imgpel) iClip1(255, (rres_row[i] + (long)pred_row[i]));
  }
}