  /****************************************************************************
  *
  * Module Title : Fdctwmt.c
  *
  * Description  : Forward DCT optimized specifically for the Intel P4
  *                processor
  *
  * AUTHOR       : YaoWu Xu
  *
  *****************************************************************************
  * Revision History
  *
  * 1.00 YWX 03/11/02 Configuration baseline
  *
  *****************************************************************************
  */
  /*******************************************************************************
  * Module Constants
  *******************************************************************************
  */
  /*
   * Scratch storage for one 128-bit vector (8 x 16-bit lanes).  fdct_WMT
   * stores the (is07 - is34) term here while all eight XMM registers are in
   * use.  Aligned to 16 bytes because it is accessed with movdqa.
   */
  __declspec(align(16)) static unsigned short TIRY[8];

  /*
   * Cosine table used by the forward DCT (despite the "Idct" in the name).
   * Row k (k = 1..7) holds approximately cos(k * pi / 16) * 65536, replicated
   * across all eight 16-bit lanes so a whole row can be used directly as a
   * pmulhw operand.  Row 0 is unused padding so that row k sits at byte
   * offset 16 * k.
   *
   * Rows 1..5 exceed 32767 and are therefore seen as (value - 65536) by the
   * signed pmulhw; the code compensates by adding the multiplicand back after
   * the multiply.  Rows 6 and 7 fit in a signed 16-bit lane and need no
   * compensation.
   */
  __declspec(align(16)) static unsigned short WmtIdctConst[8 * 8] =
  {
      0,     0,     0,     0,     0,     0,     0,     0,
      64277, 64277, 64277, 64277, 64277, 64277, 64277, 64277,
      60547, 60547, 60547, 60547, 60547, 60547, 60547, 60547,
      54491, 54491, 54491, 54491, 54491, 54491, 54491, 54491,
      46341, 46341, 46341, 46341, 46341, 46341, 46341, 46341,
      36410, 36410, 36410, 36410, 36410, 36410, 36410, 36410,
      25080, 25080, 25080, 25080, 25080, 25080, 25080, 25080,
      12785, 12785, 12785, 12785, 12785, 12785, 12785, 12785
  };
  33. /**************************************************************************************
  34. *
  35. * Macro: FDct_WMT
  36. *
  37. * Description: The Macro does 1-D IDct on 8 columns.
  38. *
  39. * Input: None
  40. *
  41. * Output: None
  42. *
  43. * Return: None
  44. *
  45. * Special Note: None
  46. *
  47. * Error: None
  48. *
  49. ***************************************************************************************
  50. */
  51. void fdct_WMT(short *InputData, short *OutputData)
  52. {
  53. __asm
  54. {
  55. mov eax, InputData
  56. mov ebx, OutputData
  57. lea edx, WmtIdctConst
  58. #define I(i) [eax + 16 * i ]
  59. #define O(i) [ebx + 16 * i ]
  60. #define C(i) [edx + 16 * i ]
  61. /******************************************************/
  62. /* Do 8x8 Transpose */
  63. /******************************************************/
  64. movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */
  65. movdqa xmm0, I(5) /* xmm4=f7f6f5f4f3f2f1f0 */
  66. psllw xmm4, 1
  67. psllw xmm0, 1
  68. movdqa xmm5, xmm4 /* make a copy */
  69. punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */
  70. punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */
  71. movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */
  72. movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */
  73. psllw xmm6, 1
  74. psllw xmm0, 1
  75. movdqa xmm7, xmm6 /* make a copy */
  76. punpcklwd xmm6, xmm0 /* xmm6=h3g3h3g2h1g1h0g0 */
  77. punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */
  78. movdqa xmm3, xmm4 /* make a copy */
  79. punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */
  80. punpckhdq xmm3, xmm6 /* xmm3=h3g3g3e3h2g2f2e2 */
  81. movdqa I(6), xmm3 /* save h3g3g3e3h2g2f2e2 */
  82. /* Free xmm6 */
  83. movdqa xmm6, xmm5 /* make a copy */
  84. punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */
  85. punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */
  86. movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */
  87. /* Free xmm7 */
  88. movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */
  89. psllw xmm0, 1
  90. psllw xmm1, 1
  91. movdqa xmm7, xmm0 /* make a copy */
  92. punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */
  93. punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */
  94. /* Free xmm1 */
  95. movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */
  96. movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */
  97. psllw xmm2, 1
  98. psllw xmm3, 1
  99. movdqa xmm1, xmm2 /* make a copy */
  100. punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */
  101. punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */
  102. movdqa xmm3, xmm0 /* make a copy */
  103. punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */
  104. punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */
  105. /* Free xmm2 */
  106. movdqa xmm2, xmm7 /* make a copy */
  107. punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */
  108. punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */
  109. movdqa xmm1, xmm0 /* make a copy */
  110. punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */
  111. punpckhqdq xmm1, xmm4 /* xmm1=h1g1g1e1d1c1b1a1 */
  112. movdqa I(0), xmm0 /* save I(0) */
  113. movdqa I(1), xmm1 /* save I(1) */
  114. movdqa xmm0, I(6) /* load h3g3g3e3h2g2f2e2 */
  115. movdqa xmm1, xmm3 /* make a copy */
  116. punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */
  117. punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */
  118. movdqa xmm4, xmm2 /* make a copy */
  119. punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */
  120. punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */
  121. movdqa I(2), xmm1 /* save I(2) */
  122. movdqa I(3), xmm3 /* save I(3) */
  123. movdqa I(4), xmm4 /* save I(4) */
  124. movdqa I(5), xmm2 /* save I(5) */
  125. movdqa xmm5, xmm7 /* make a copy */
  126. punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */
  127. punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */
  128. movdqa I(6), xmm5 /* save I(6) */
  129. movdqa I(7), xmm7 /* save I(7) */
  130. /******************************************************/
  131. /* Done with transpose - Let's do the forward DCT */
  132. /******************************************************/
  133. movdqa xmm0, I(0) /* xmm0 = ip0 */
  134. movdqa xmm1, I(1) /* xmm1 = ip1 */
  135. movdqa xmm2, I(3) /* xmm2 = ip3 */
  136. movdqa xmm3, I(5) /* xmm3 = ip5 */
  137. movdqa xmm4, xmm0 /* xmm4 = ip0 */
  138. movdqa xmm5, xmm1 /* xmm5 = ip1 */
  139. movdqa xmm6, xmm2 /* xmm6 = ip3 */
  140. movdqa xmm7, xmm3 /* xmm7 = ip5 */
  141. paddsw xmm0, I(7) /* xmm0 = ip0 + ip7 */
  142. paddsw xmm1, I(2) /* xmm1 = ip1 + ip2 */
  143. paddsw xmm2, I(4) /* xmm2 = ip3 + ip4 */
  144. paddsw xmm3, I(6) /* xmm3 = ip5 + ip6 */
  145. psubsw xmm4, I(7) /* xmm4 = ip0 - ip7 */
  146. psubsw xmm5, I(2) /* xmm5 = ip1 - ip2 */
  147. psubsw xmm0, xmm2 /* xmm0 = is07 - is34 */
  148. paddsw xmm2, xmm2 /* xmm2 = is34 * 2 */
  149. psubsw xmm6, I(4) /* xmm6 = ip3 - ip4 */
  150. paddsw xmm2, xmm0 /* xmm2 = is07 + is34 */
  151. psubsw xmm1, xmm3 /* xmm1 = is12 - is56 */
  152. movdqa TIRY, xmm0 /* save is07-is34 */
  153. paddsw xmm3, xmm3 /* xmm3 = is56 * 2 */
  154. paddsw xmm3, xmm1 /* xmm3 = is12 + is56 */
  155. psubsw xmm7, I(6) /* xmm7 = ip5 -ip6 */
  156. psubsw xmm5, xmm7 /* xmm5 = id12 - id56 */
  157. paddsw xmm7, xmm7 /* xmm7 = id56 * 2 */
  158. paddsw xmm7, xmm5 /* xmm7 = id12 + id56 */
  159. /*---------------------------------------------------------*/
  160. /* op0 and op4
  161. /*---------------------------------------------------------*/
  162. psubsw xmm2, xmm3 /* xmm2 = is0734 - is1256 */
  163. paddsw xmm3, xmm3 /* xmm3 = is1256 * 2 */
  164. movdqa xmm0, xmm2 /* xmm0 = is0734 - is1256 */
  165. paddsw xmm3, xmm2 /* xmm3 = is0734 + is1256 */
  166. pmulhw xmm0, C(4) /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
  167. paddw xmm0, xmm2 /* xmm0 = xC4S4 * ( is0734 - is1256 ) */
  168. psrlw xmm2, 15
  169. paddw xmm0, xmm2 /* Truncate xmm0, now it is op[4] */
  170. movdqa xmm2, xmm3 /* xmm2 = is0734 + is1256 */
  171. movdqa O(4), xmm0 /* op4, now xmm0,xmm2 are free */
  172. movdqa xmm0, xmm3 /* xmm0 = is0734 + is1256 */
  173. pmulhw xmm3, C(4) /* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
  174. psrlw xmm2, 15
  175. paddw xmm3, xmm0 /* xmm3 = xC4S4 * ( is0734 +is1256 ) */
  176. paddw xmm3, xmm2 /* Truncate xmm3, now it is op[0] */
  177. movdqa O(0), xmm3 /* save op0 */
  178. /*---------------------------------------------------------*/
  179. /* op2 and op6
  180. /*---------------------------------------------------------*/
  181. movdqa xmm3, TIRY /* xmm3 = irot_input_y */
  182. pmulhw xmm3, C(2) /* xmm3 = xC2S6 * irot_input_y - irot_input_y */
  183. movdqa xmm2, TIRY /* xmm2 = irot_input_y */
  184. movdqa xmm0, xmm2 /* xmm0 = irot_input_y */
  185. psrlw xmm2, 15
  186. paddw xmm3, xmm0 /* xmm3 = xC2S6 * irot_input_y */
  187. paddw xmm3, xmm2 /* Truncated */
  188. movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
  189. movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
  190. pmulhw xmm0, C(6) /* xmm0 = xC6S2 * irot_input_x */
  191. psrlw xmm2, 15
  192. paddw xmm0, xmm2 /* Truncated */
  193. paddsw xmm3, xmm0 /* op[2] */
  194. movdqa O(2), xmm3 /* save op[2] */
  195. movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
  196. movdqa xmm2, xmm5 /* xmm0 = id12 - id56 */
  197. pmulhw xmm5, C(2) /* xmm5 = xC2S6 * irot_input_x - irot_input_x */
  198. psrlw xmm2, 15
  199. movdqa xmm3, TIRY /* xmm3 = irot_input_y */
  200. paddw xmm5, xmm0 /* xmm5 = xC2S6 * irot_input_x */
  201. paddw xmm5, xmm2 /* Truncated */
  202. movdqa xmm2, xmm3 /* xmm2 = irot_input_y */
  203. pmulhw xmm3, C(6) /* mm3 = xC6S2 * irot_input_y */
  204. psrlw xmm2, 15
  205. paddw xmm3, xmm2 /* Truncated */
  206. psubsw xmm3, xmm5 /* xmm3 = op[6] */
  207. movdqa O(6), xmm3
  208. /*-----------------------------------------------------------------------*/
  209. /* icommon_product1, icommon_product2 */
  210. /*-----------------------------------------------------------------------*/
  211. movdqa xmm0, C(4) /* xmm0 = xC4s4 */
  212. movdqa xmm2, xmm1 /* xmm2 = is12 - is56 */
  213. movdqa xmm3, xmm1 /* xmm3 = is12 - is56 */
  214. pmulhw xmm1, xmm0 /* xmm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
  215. psrlw xmm2, 15
  216. paddw xmm1, xmm3 /* xmm1 = xC4S4 * ( is12 - is56 ) */
  217. paddw xmm1, xmm2 /* Truncate xmm1, now it is icommon_product1 */
  218. movdqa xmm2, xmm7 /* xmm2 = id12 + id56 */
  219. movdqa xmm3, xmm7 /* xmm3 = id12 + id56 */
  220. pmulhw xmm7, xmm0 /* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
  221. psrlw xmm2, 15 /* For trucation */
  222. paddw xmm7, xmm3 /* xmm7 = xC4S4 * ( id12 + id56 ) */
  223. paddw xmm7, xmm2 /* Truncate xmm7, now it is icommon_product2 */
  224. /*---------------------------------------------------------*/
  225. pxor xmm0, xmm0 /* Clear xmm0 */
  226. psubsw xmm0, xmm6 /* xmm0 = - id34 */
  227. psubsw xmm0, xmm7 /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
  228. paddsw xmm6, xmm6 /* xmm6 = id34 * 2 */
  229. paddsw xmm6, xmm0 /* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
  230. psubsw xmm4, xmm1 /* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
  231. paddsw xmm1, xmm1 /* xmm1 = icommon_product1 * 2 */
  232. paddsw xmm1, xmm4 /* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
  233. /*---------------------------------------------------------*/
  234. /* op1 and op7
  235. /*---------------------------------------------------------*/
  236. movdqa xmm7, C(1) /* xC1S7 */
  237. movdqa xmm2, xmm1 /* xmm2 = irot_input_x */
  238. movdqa xmm3, xmm1; /* xmm3 = irot_input_x */
  239. pmulhw xmm1, xmm7 /* xmm1 = xC1S7 * irot_input_x - irot_input_x */
  240. movdqa xmm7, C(7) /* xC7S1 */
  241. psrlw xmm2, 15 /* for trucation */
  242. paddw xmm1, xmm3 /* xmm1 = xC1S7 * irot_input_x */
  243. paddw xmm1, xmm2 /* Trucated */
  244. pmulhw xmm3, xmm7 /* xmm3 = xC7S1 * irot_input_x */
  245. paddw xmm3, xmm2 /* Truncated */
  246. movdqa xmm5, xmm0 /* xmm5 = irot_input_y */
  247. movdqa xmm2, xmm0 /* xmm2 = irot_input_y */
  248. movdqa xmm7, C(1) /* xC1S7 */
  249. pmulhw xmm0, xmm7 /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
  250. movdqa xmm7, C(7) /* xC7S1 */
  251. psrlw xmm2, 15 /* for trucation */
  252. paddw xmm0, xmm5 /* xmm0 = xC1S7 * irot_input_y */
  253. paddw xmm0, xmm2 /* Truncated */
  254. pmulhw xmm5, xmm7 /* xmm5 = xC7S1 * irot_input_y */
  255. paddw xmm5, xmm2 /* Truncated */
  256. psubsw xmm1, xmm5 /* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
  257. paddsw xmm3, xmm0 /* xmm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
  258. movdqa O(1), xmm1
  259. movdqa O(7), xmm3
  260. /*---------------------------------------------------------*/
  261. /* op3 and op5
  262. /*---------------------------------------------------------*/
  263. movdqa xmm0, C(3) /* xC3S5 */
  264. movdqa xmm1, C(5) /* xC5S3 */
  265. movdqa xmm5,xmm6 /* irot_input_x */
  266. movdqa xmm7,xmm6 /* irot_input_x */
  267. movdqa xmm2,xmm4 /* irot_input_y */
  268. movdqa xmm3,xmm4 /* irot_input_y */
  269. pmulhw xmm4,xmm0 /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
  270. pmulhw xmm6,xmm1 /* xmm6 = xC5S3 * irot_input_y - irot_input_y */
  271. psrlw xmm2,15 /* for trucation */
  272. psrlw xmm5,15 /* for trucation */
  273. paddw xmm4,xmm3 /* xmm4 = xC3S5 * irot_input_x */
  274. paddw xmm6,xmm7 /* xmm6 = xC5S3 * irot_input_y */
  275. paddw xmm4,xmm2 /* Truncated */
  276. paddw xmm6,xmm5 /* Truncated */
  277. psubsw xmm4,xmm6 /* op [3] */
  278. movdqa O(3),xmm4 /* Save Op[3] */
  279. movdqa xmm4,xmm3 /* irot_input_y */
  280. movdqa xmm6,xmm7 /* irot_input_x */
  281. pmulhw xmm3,xmm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
  282. pmulhw xmm7,xmm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
  283. paddw xmm4,xmm2 /* Trucated */
  284. paddw xmm6,xmm5 /* Trucated */
  285. paddw xmm3,xmm4 /* xmm3 = xC5S3 * irot_input_x */
  286. paddw xmm7,xmm6 /* mm7 = xC3S5 * irot_input_y */
  287. paddw xmm3,xmm7 /* Op[5] */
  288. movdqa O(5),xmm3 /* Save Op[5] */
  289. /*---------------------------------------------------------*/
  290. /* End of 8 1-D FDCT */
  291. /*---------------------------------------------------------*/
  292. #undef I
  293. #undef O
  294. #define I(i) [ebx + 16 * i ]
  295. #define O(i) [ebx + 16 * i ]
  296. /******************************************************/
  297. /* Do 8x8 Transpose */
  298. /******************************************************/
  299. movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */
  300. movdqa xmm0, I(5) /* xmm4=f7f6f5f4f3f2f1f0 */
  301. movdqa xmm5, xmm4 /* make a copy */
  302. punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */
  303. punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */
  304. movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */
  305. movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */
  306. movdqa xmm7, xmm6 /* make a copy */
  307. punpcklwd xmm6, xmm0 /* xmm6=h3g3h3g2h1g1h0g0 */
  308. punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */
  309. movdqa xmm3, xmm4 /* make a copy */
  310. punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */
  311. punpckhdq xmm3, xmm6 /* xmm3=h3g3g3e3h2g2f2e2 */
  312. movdqa I(6), xmm3 /* save h3g3g3e3h2g2f2e2 */
  313. /* Free xmm6 */
  314. movdqa xmm6, xmm5 /* make a copy */
  315. punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */
  316. punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */
  317. movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */
  318. /* Free xmm7 */
  319. movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */
  320. movdqa xmm7, xmm0 /* make a copy */
  321. punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */
  322. punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */
  323. /* Free xmm1 */
  324. movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */
  325. movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */
  326. movdqa xmm1, xmm2 /* make a copy */
  327. punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */
  328. punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */
  329. movdqa xmm3, xmm0 /* make a copy */
  330. punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */
  331. punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */
  332. /* Free xmm2 */
  333. movdqa xmm2, xmm7 /* make a copy */
  334. punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */
  335. punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */
  336. movdqa xmm1, xmm0 /* make a copy */
  337. punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */
  338. punpckhqdq xmm1, xmm4 /* xmm1=h1g1g1e1d1c1b1a1 */
  339. movdqa I(0), xmm0 /* save I(0) */
  340. movdqa I(1), xmm1 /* save I(1) */
  341. movdqa xmm0, I(6) /* load h3g3g3e3h2g2f2e2 */
  342. movdqa xmm1, xmm3 /* make a copy */
  343. punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */
  344. punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */
  345. movdqa xmm4, xmm2 /* make a copy */
  346. punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */
  347. punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */
  348. movdqa I(2), xmm1 /* save I(2) */
  349. movdqa I(3), xmm3 /* save I(3) */
  350. movdqa I(4), xmm4 /* save I(4) */
  351. movdqa I(5), xmm2 /* save I(5) */
  352. movdqa xmm5, xmm7 /* make a copy */
  353. punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */
  354. punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */
  355. movdqa I(6), xmm5 /* save I(6) */
  356. movdqa I(7), xmm7 /* save I(7) */
  357. /******************************************************/
  358. /* Done with transpose - Let's do the forward DCT */
  359. /******************************************************/
  360. movdqa xmm0, I(0) /* xmm0 = ip0 */
  361. movdqa xmm1, I(1) /* xmm1 = ip1 */
  362. movdqa xmm2, I(3) /* xmm2 = ip3 */
  363. movdqa xmm3, I(5) /* xmm3 = ip5 */
  364. movdqa xmm4, xmm0 /* xmm4 = ip0 */
  365. movdqa xmm5, xmm1 /* xmm5 = ip1 */
  366. movdqa xmm6, xmm2 /* xmm6 = ip3 */
  367. movdqa xmm7, xmm3 /* xmm7 = ip5 */
  368. paddsw xmm0, I(7) /* xmm0 = ip0 + ip7 */
  369. paddsw xmm1, I(2) /* xmm1 = ip1 + ip2 */
  370. paddsw xmm2, I(4) /* xmm2 = ip3 + ip4 */
  371. paddsw xmm3, I(6) /* xmm3 = ip5 + ip6 */
  372. psubsw xmm4, I(7) /* xmm4 = ip0 - ip7 */
  373. psubsw xmm5, I(2) /* xmm5 = ip1 - ip2 */
  374. psubsw xmm0, xmm2 /* xmm0 = is07 - is34 */
  375. paddsw xmm2, xmm2 /* xmm2 = is34 * 2 */
  376. psubsw xmm6, I(4) /* xmm6 = ip3 - ip4 */
  377. paddsw xmm2, xmm0 /* xmm2 = is07 + is34 */
  378. psubsw xmm1, xmm3 /* xmm1 = is12 - is56 */
  379. movdqa TIRY, xmm0 /* save is07-is34 */
  380. paddsw xmm3, xmm3 /* xmm3 = is56 * 2 */
  381. paddsw xmm3, xmm1 /* xmm3 = is12 + is56 */
  382. psubsw xmm7, I(6) /* xmm7 = ip5 -ip6 */
  383. psubsw xmm5, xmm7 /* xmm5 = id12 - id56 */
  384. paddsw xmm7, xmm7 /* xmm7 = id56 * 2 */
  385. paddsw xmm7, xmm5 /* xmm7 = id12 + id56 */
  386. /*---------------------------------------------------------*/
  387. /* op0 and op4
  388. /*---------------------------------------------------------*/
  389. #if 0
  390. movdqa xmm0, xmm2 /* xmm0 =xmm2= is0734 */
  391. pmulhw xmm2, C(4) /* xC4S4 * is0734 - is0734 */
  392. paddw xmm2, xmm0 /* XC4S4 * is0734 */
  393. movdqa xmm0, xmm3 /* xmm0 =xmm3= is1256 */
  394. pmulhw xmm3, C(4) /* xC4S4 * is1256 - is1256 */
  395. paddw xmm3, xmm0 /* xC4S4 * is1256 */
  396. movdqa xmm0, xmm2
  397. paddsw xmm2, xmm3 /* xC4S4 * ( is0734 +is1256 ) */
  398. psubsw xmm0, xmm3 /* xC4S4 * ( is0734 -is1256 ) */
  399. movdqa xmm3, xmm2
  400. psrlw xmm2, 15
  401. paddsw xmm3, xmm2
  402. movdqa xmm2, xmm0
  403. movdqa O(0), xmm3
  404. psrlw xmm0, 15
  405. paddsw xmm2, xmm0
  406. movdqa O(4), xmm2
  407. #else
  408. psubsw xmm2, xmm3 /* xmm2 = is0734 - is1256 */
  409. paddsw xmm3, xmm3 /* xmm3 = is1256 * 2 */
  410. movdqa xmm0, xmm2 /* xmm0 = is0734 - is1256 */
  411. paddsw xmm3, xmm2 /* xmm3 = is0734 + is1256 */
  412. pmulhw xmm0, C(4) /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
  413. paddw xmm0, xmm2 /* xmm0 = xC4S4 * ( is0734 - is1256 ) */
  414. psrlw xmm2, 15
  415. paddw xmm0, xmm2 /* Truncate xmm0, now it is op[4] */
  416. movdqa xmm2, xmm0
  417. psrlw xmm0, 15
  418. paddw xmm0, xmm2
  419. psraw xmm0, 1
  420. movdqa O(4), xmm0 /* op4, now xmm0,xmm2 are free */
  421. movdqa xmm2, xmm3 /* xmm2 = is0734 + is1256 */
  422. movdqa xmm0, xmm3 /* xmm0 = is0734 + is1256 */
  423. pmulhw xmm3, C(4) /* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
  424. psrlw xmm2, 15
  425. paddw xmm3, xmm0 /* xmm3 = xC4S4 * ( is0734 +is1256 ) */
  426. paddw xmm3, xmm2 /* Truncate xmm3, now it is op[0] */
  427. movdqa xmm2, xmm3
  428. psrlw xmm3, 15
  429. paddw xmm3, xmm2
  430. psraw xmm3, 1
  431. movdqa O(0), xmm3 /* save op0 */
  432. #endif
  433. /*---------------------------------------------------------*/
  434. /* op2 and op6
  435. /*---------------------------------------------------------*/
  436. movdqa xmm3, TIRY /* xmm3 = irot_input_y */
  437. pmulhw xmm3, C(2) /* xmm3 = xC2S6 * irot_input_y - irot_input_y */
  438. movdqa xmm2, TIRY /* xmm2 = irot_input_y */
  439. movdqa xmm0, xmm2 /* xmm0 = irot_input_y */
  440. psrlw xmm2, 15
  441. paddw xmm3, xmm0 /* xmm3 = xC2S6 * irot_input_y */
  442. paddw xmm3, xmm2 /* Truncated */
  443. movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
  444. movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
  445. pmulhw xmm0, C(6) /* xmm0 = xC6S2 * irot_input_x */
  446. psrlw xmm2, 15
  447. paddw xmm0, xmm2 /* Truncated */
  448. paddsw xmm3, xmm0 /* op[2] */
  449. movdqa xmm0, xmm3
  450. psrlw xmm3, 15
  451. paddw xmm3, xmm0
  452. psraw xmm3, 1
  453. movdqa O(2), xmm3 /* save op[2] */
  454. movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
  455. movdqa xmm2, xmm5 /* xmm0 = id12 - id56 */
  456. pmulhw xmm5, C(2) /* xmm5 = xC2S6 * irot_input_x - irot_input_x */
  457. psrlw xmm2, 15
  458. movdqa xmm3, TIRY /* xmm3 = irot_input_y */
  459. paddw xmm5, xmm0 /* xmm5 = xC2S6 * irot_input_x */
  460. paddw xmm5, xmm2 /* Truncated */
  461. movdqa xmm2, xmm3 /* xmm2 = irot_input_y */
  462. pmulhw xmm3, C(6) /* mm3 = xC6S2 * irot_input_y */
  463. psrlw xmm2, 15
  464. paddw xmm3, xmm2 /* Truncated */
  465. psubsw xmm3, xmm5 /* xmm3 = op[6] */
  466. movdqa xmm5, xmm3
  467. psrlw xmm3, 15
  468. paddw xmm3, xmm5
  469. psraw xmm3, 1
  470. movdqa O(6), xmm3
  471. /*-----------------------------------------------------------------------*/
  472. /* icommon_product1, icommon_product2 */
  473. /*-----------------------------------------------------------------------*/
  474. movdqa xmm0, C(4) /* xmm0 = xC4s4 */
  475. movdqa xmm2, xmm1 /* xmm2 = is12 - is56 */
  476. movdqa xmm3, xmm1 /* xmm3 = is12 - is56 */
  477. pmulhw xmm1, xmm0 /* xmm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
  478. psrlw xmm2, 15
  479. paddw xmm1, xmm3 /* xmm1 = xC4S4 * ( is12 - is56 ) */
  480. paddw xmm1, xmm2 /* Truncate xmm1, now it is icommon_product1 */
  481. movdqa xmm2, xmm7 /* xmm2 = id12 + id56 */
  482. movdqa xmm3, xmm7 /* xmm3 = id12 + id56 */
  483. pmulhw xmm7, xmm0 /* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
  484. psrlw xmm2, 15 /* For trucation */
  485. paddw xmm7, xmm3 /* xmm7 = xC4S4 * ( id12 + id56 ) */
  486. paddw xmm7, xmm2 /* Truncate xmm7, now it is icommon_product2 */
  487. /*---------------------------------------------------------*/
  488. pxor xmm0, xmm0 /* Clear xmm0 */
  489. psubsw xmm0, xmm6 /* xmm0 = - id34 */
  490. psubsw xmm0, xmm7 /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
  491. paddsw xmm6, xmm6 /* xmm6 = id34 * 2 */
  492. paddsw xmm6, xmm0 /* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
  493. psubsw xmm4, xmm1 /* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
  494. paddsw xmm1, xmm1 /* xmm1 = icommon_product1 * 2 */
  495. paddsw xmm1, xmm4 /* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
  496. /*---------------------------------------------------------*/
  497. /* op1 and op7
  498. /*---------------------------------------------------------*/
  499. movdqa xmm7, C(1) /* xC1S7 */
  500. movdqa xmm2, xmm1 /* xmm2 = irot_input_x */
  501. movdqa xmm3, xmm1; /* xmm3 = irot_input_x */
  502. pmulhw xmm1, xmm7 /* xmm1 = xC1S7 * irot_input_x - irot_input_x */
  503. movdqa xmm7, C(7) /* xC7S1 */
  504. psrlw xmm2, 15 /* for trucation */
  505. paddw xmm1, xmm3 /* xmm1 = xC1S7 * irot_input_x */
  506. paddw xmm1, xmm2 /* Trucated */
  507. pmulhw xmm3, xmm7 /* xmm3 = xC7S1 * irot_input_x */
  508. paddw xmm3, xmm2 /* Truncated */
  509. movdqa xmm5, xmm0 /* xmm5 = irot_input_y */
  510. movdqa xmm2, xmm0 /* xmm2 = irot_input_y */
  511. movdqa xmm7, C(1) /* xC1S7 */
  512. pmulhw xmm0, xmm7 /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
  513. movdqa xmm7, C(7) /* xC7S1 */
  514. psrlw xmm2, 15 /* for trucation */
  515. paddw xmm0, xmm5 /* xmm0 = xC1S7 * irot_input_y */
  516. paddw xmm0, xmm2 /* Truncated */
  517. pmulhw xmm5, xmm7 /* xmm5 = xC7S1 * irot_input_y */
  518. paddw xmm5, xmm2 /* Truncated */
  519. psubsw xmm1, xmm5 /* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
  520. paddsw xmm3, xmm0 /* xmm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
  521. movdqa xmm5, xmm1
  522. movdqa xmm0, xmm3
  523. psrlw xmm1, 15
  524. psrlw xmm3, 15
  525. paddw xmm1, xmm5
  526. paddw xmm3, xmm0
  527. psraw xmm1, 1
  528. psraw xmm3, 1
  529. movdqa O(1), xmm1
  530. movdqa O(7), xmm3
  531. /*---------------------------------------------------------*/
  532. /* op3 and op5
  533. /*---------------------------------------------------------*/
  534. movdqa xmm0, C(3) /* xC3S5 */
  535. movdqa xmm1, C(5) /* xC5S3 */
  536. movdqa xmm5,xmm6 /* irot_input_x */
  537. movdqa xmm7,xmm6 /* irot_input_x */
  538. movdqa xmm2,xmm4 /* irot_input_y */
  539. movdqa xmm3,xmm4 /* irot_input_y */
  540. pmulhw xmm4,xmm0 /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
  541. pmulhw xmm6,xmm1 /* xmm6 = xC5S3 * irot_input_y - irot_input_y */
  542. psrlw xmm2,15 /* for trucation */
  543. psrlw xmm5,15 /* for trucation */
  544. paddw xmm4,xmm3 /* xmm4 = xC3S5 * irot_input_x */
  545. paddw xmm6,xmm7 /* xmm6 = xC5S3 * irot_input_y */
  546. paddw xmm4,xmm2 /* Truncated */
  547. paddw xmm6,xmm5 /* Truncated */
  548. psubsw xmm4,xmm6 /* op [3] */
  549. movdqa xmm6,xmm4
  550. psrlw xmm4,15
  551. paddw xmm4,xmm6
  552. psraw xmm4,1
  553. movdqa O(3),xmm4 /* Save Op[3] */
  554. movdqa xmm4,xmm3 /* irot_input_y */
  555. movdqa xmm6,xmm7 /* irot_input_x */
  556. pmulhw xmm3,xmm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
  557. pmulhw xmm7,xmm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
  558. paddw xmm4,xmm2 /* Trucated */
  559. paddw xmm6,xmm5 /* Trucated */
  560. paddw xmm3,xmm4 /* xmm3 = xC5S3 * irot_input_x */
  561. paddw xmm7,xmm6 /* mm7 = xC3S5 * irot_input_y */
  562. paddw xmm3,xmm7 /* Op[5] */
  563. movdqa xmm7,xmm3
  564. psrlw xmm3,15
  565. paddw xmm3,xmm7
  566. psraw xmm3,1
  567. movdqa O(5),xmm3 /* Save Op[5] */
  568. /*---------------------------------------------------------*/
  569. /* End of 8 1-D FDCT */
  570. /*---------------------------------------------------------*/
  571. }/* end of _asm code section */
  572. }