fdctmmx.c 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398
  1. /****************************************************************************
  2. *
  3. * Module Title : fdctmmx.c
  4. *
  5. * Description : Forward DCT optimized specifically for mmx or compatible
  6. * processor
  7. *
  8. * AUTHOR : Yaowu Xu
  9. *
  10. *****************************************************************************
  11. * Revision History
  12. *
  13. * 1.00 YWX 07/11/11 Configuration baseline
  14. *
  15. *****************************************************************************
  16. */
  17. /*******************************************************************************
  18. * Module Constants
  19. *******************************************************************************
  20. */
  21. __declspec(align(16)) static unsigned short TIRY[8]; /* Scratch slot: fdct_MMX spills (is07 - is34) here with movq and reloads it for op2/op6. Only the first 8 bytes are written by the code reviewed. NOTE(review): file-scope storage, so concurrent calls would share it - confirm callers are single-threaded. */
  22. __declspec(align(16)) static unsigned short MmxIdctConst[8 * 4] = /* Cosine table: 8 rows of 4 identical 16-bit words, so row i loads as one 64-bit MMX operand via C(i) = [edx + 8 * i]. Row k holds about 65536 * cos(k * pi / 16). */
  23. {
  24. 0, 0, 0, 0, /* row 0: placeholder so the row index matches C(i); C(0) is not referenced in the code reviewed */
  25. 64277,64277,64277,64277, /* C(1), "xC1S7": ~65536 * cos(1*pi/16). Above 32767, so signed pmulhw yields c*x - x and the code adds x back. */
  26. 60547,60547,60547,60547, /* C(2), "xC2S6": ~65536 * cos(2*pi/16); same add-back applies */
  27. 54491,54491,54491,54491, /* C(3), "xC3S5": ~65536 * cos(3*pi/16); same add-back applies */
  28. 46341,46341,46341,46341, /* C(4), "xC4S4": ~65536 * cos(4*pi/16); same add-back applies */
  29. 36410,36410,36410,36410, /* C(5), "xC5S3": ~65536 * cos(5*pi/16); same add-back applies */
  30. 25080,25080,25080,25080, /* C(6), "xC6S2": ~65536 * cos(6*pi/16). Below 32768, so pmulhw gives c*x directly (no add-back). */
  31. 12785,12785,12785,12785 /* C(7), "xC7S1": ~65536 * cos(7*pi/16); no add-back needed */
  32. };
  33. /**************************************************************************************
  34. *
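  35. * Function: fdct_MMX
  36. *
  37. * Description: Forward DCT of an 8x8 block of shorts using MMX. Each pass transposes the data as two 4x4 transposes per 4x8 half and runs a 1-D forward DCT on it.
  38. *
  39. * Input: InputData - the 8x8 source block; it is overwritten, because the transposed rows are stored back into it as scratch
  40. *
  41. * Output: OutputData - receives the transformed coefficients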
  42. *
  43. * Return: None
  44. *
  45. * Special Note: The inputdata is limited to 9 bits [-256, 255]
  46. *
  47. * Error: None
  48. *
  49. ***************************************************************************************
  50. */
  51. void fdct_MMX(short *InputData, short *OutputData)
  52. {
  53. __asm
  54. {
  55. mov eax, InputData
  56. mov ebx, OutputData
  57. lea ecx, [eax+8]
  58. lea edi, [ebx+8]
  59. lea edx, MmxIdctConst
  60. #define IL(i) [eax + 16 * i]
  61. #define IH(i) [ecx + 16 * i]
  62. #define OL(i) [ebx + 16 * i]
  63. #define OH(i) [edi + 16 * i]
  64. #define C(i) [edx + 8 * i]
  65. /******************************************************/
  66. /* Do 4x8 Transpose is done through 2 4x4 Transpose */
  67. /******************************************************/
  68. movq mm4, IH(0) /* mm4=e3e2e1e0 */
  69. movq mm0, IH(1) /* mm4=f3f2f1f0 */
  70. psllw mm4, 1 /* up precision */
  71. psllw mm0, 1 /* up precision */
  72. movq mm5, mm4 /* make a copy */
  73. punpcklwd mm4, mm0 /* mm4=f1e1f0e0 */
  74. punpckhwd mm5, mm0 /* mm5=f3e3f2e2 */
  75. movq mm6, IH(2) /* mm6=g3g2g1g0 */
  76. movq mm0, IH(3) /* mm0=h3h2h1h0 */
  77. psllw mm6, 1 /* up precision */
  78. psllw mm0, 1 /* up precision */
  79. movq mm7, mm6 /* mm7=g3g2g1g0 */
  80. punpcklwd mm6, mm0 /* mm6=h1g1h0g0 */
  81. punpckhwd mm7, mm0 /* mm7=h3g3h2g2 */
  82. movq mm3, mm4 /* mm4=f1e1f0e0 */
  83. punpckldq mm4, mm6 /* mm4=h0g0f0e0 */
  84. punpckhdq mm3, mm6 /* mm3=h1g1f1e1 */
  85. movq mm6, mm5 /* mm5=f3e3f2e2 */
  86. punpckldq mm5, mm7 /* mm5=h2g2f2e2 */
  87. movq IH(0), mm4 /* saveh0g0f0e0 */
  88. punpckhdq mm6, mm7 /* mm6=h3g3f3e3 */
  89. movq IH(2), mm5 /* saveh2g2f2e2 */
  90. movq IH(3), mm6 /* saveh3g3f3e3 */
  91. /*----------------------------------------------------*/
  92. /* mm3 in use for IH(1) */
  93. /*----------------------------------------------------*/
  94. movq mm4, IL(0) /* mm4=a3a2a1a0 */
  95. movq mm0, IL(1) /* mm0=b3b2b1b0 */
  96. psllw mm4, 1 /* up precision */
  97. psllw mm0, 1 /* up precision */
  98. movq mm5, mm4 /* mm5=a3a2a1a0 */
  99. punpcklwd mm4, mm0 /* mm4=b1a1b0a0 */
  100. punpckhwd mm5, mm0 /* mm5=b3a3b2a2 */
  101. movq mm6, IL(2) /* mm6=c3c2c1c0 */
  102. movq mm0, IL(3) /* mm0=d3d2d1d0 */
  103. psllw mm6, 1 /* up precision */
  104. psllw mm0, 1 /* up precision */
  105. movq mm7, mm6 /* mm7=c3c2c1c0 */
  106. punpcklwd mm6, mm0 /* mm6=d1c1d0c0 */
  107. punpckhwd mm7, mm0 /* mm7=c3c3d2c2 */
  108. movq mm1, mm4 /* mm4=b1a1b0a0 */
  109. punpckldq mm4, mm6 /* mm4=d0c0b0a0 */
  110. punpckhdq mm1, mm6 /* mm1=d1c1b1a1 */
  111. movq mm2, mm5 /* mm5=b3a3b2a2 */
  112. punpckldq mm5, mm7 /* mm5=d2c2b2a2 */
  113. punpckhdq mm2, mm7 /* mm6=d3c3b3a3 */
  114. movq IL(2), mm5 /* saved2c2b2a2 */
  115. /*----------------------------------------------------*/
  116. /* mm1 in use for IL(1) */
  117. /* mm2 in use for IL(3) */
  118. /* mm3 in use for IH(1) */
  119. /* mm4 in use for IL(0) */
  120. /*----------------------------------------------------*/
  121. /******************************************************/
  122. /* Let's do the 4x8 forward DCT */
  123. /******************************************************/
  124. movq mm0, mm4 /* mm4 = ip0 */
  125. movq mm5, mm1 /* mm5 = ip1 */
  126. movq mm6, mm2 /* mm6 = ip3 */
  127. movq mm7, mm3 /* mm7 = ip5 */
  128. paddsw mm0, IH(3) /* mm0 = ip0 + ip7 */
  129. paddsw mm1, IL(2) /* mm1 = ip1 + ip2 */
  130. paddsw mm2, IH(0) /* mm2 = ip3 + ip4 */
  131. paddsw mm3, IH(2) /* mm3 = ip5 + ip6 */
  132. psubsw mm4, IH(3) /* mm4 = ip0 - ip7 */
  133. psubsw mm5, IL(2) /* mm5 = ip1 - ip2 */
  134. psubsw mm0, mm2 /* mm0 = is07 - is34 */
  135. paddsw mm2, mm2 /* mm2 = is34 * 2 */
  136. psubsw mm6, IH(0) /* mm6 = ip3 - ip4 */
  137. paddsw mm2, mm0 /* mm2 = is07 + is34 */
  138. psubsw mm1, mm3 /* mm1 = is12 - is56 */
  139. movq TIRY, mm0 /* save is07-is34 */
  140. paddsw mm3, mm3 /* mm3 = is56 * 2 */
  141. paddsw mm3, mm1 /* mm3 = is12 + is56 */
  142. psubsw mm7, IH(2) /* mm7 = ip5 -ip6 */
  143. psubsw mm5, mm7 /* mm5 = id12 - id56 */
  144. paddsw mm7, mm7 /* mm7 = id56 * 2 */
  145. paddsw mm7, mm5 /* mm7 = id12 + id56 */
  146. /*---------------------------------------------------------*/
  147. /* op0 and op4
  148. /*---------------------------------------------------------*/
  149. psubsw mm2, mm3 /* mm2 = is0734 - is1256 */
  150. paddsw mm3, mm3 /* mm3 = is1256 * 2 */
  151. movq mm0, mm2 /* mm0 = is0734 - is1256 */
  152. paddsw mm3, mm2 /* mm3 = is0734 + is1256 */
  153. pmulhw mm0, C(4) /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
  154. paddw mm0, mm2 /* mm0 = xC4S4 * ( is0734 - is1256 ) */
  155. psrlw mm2, 15
  156. paddw mm0, mm2 /* Truncate mm0, now it is op[4] */
  157. movq mm2, mm3 /* mm2 = is0734 + is1256 */
  158. movq OH(0), mm0 /* op4, now mm0,mm2 are free */
  159. movq mm0, mm3 /* mm0 = is0734 + is1256 */
  160. pmulhw mm3, C(4) /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
  161. psrlw mm2, 15
  162. paddw mm3, mm0 /* mm3 = xC4S4 * ( is0734 +is1256 ) */
  163. paddw mm3, mm2 /* Truncate mm3, now it is op[0] */
  164. movq OL(0), mm3 /* save op0 */
  165. /*---------------------------------------------------------*/
  166. /* op2 and op6
  167. /*---------------------------------------------------------*/
  168. movq mm3, TIRY /* mm3 = irot_input_y */
  169. pmulhw mm3, C(2) /* mm3 = xC2S6 * irot_input_y - irot_input_y */
  170. movq mm2, TIRY /* mm2 = irot_input_y */
  171. movq mm0, mm2 /* mm0 = irot_input_y */
  172. psrlw mm2, 15
  173. paddw mm3, mm0 /* mm3 = xC2S6 * irot_input_y */
  174. paddw mm3, mm2 /* Truncated */
  175. movq mm0, mm5 /* mm0 = id12 - id56 */
  176. movq mm2, mm5 /* mm2 = id12 - id56 */
  177. pmulhw mm0, C(6) /* mm0 = xC6S2 * irot_input_x */
  178. psrlw mm2, 15
  179. paddw mm0, mm2 /* Truncated */
  180. paddsw mm3, mm0 /* op[2] */
  181. movq OL(2), mm3 /* save op[2] */
  182. movq mm0, mm5 /* mm0 = id12 - id56 */
  183. movq mm2, mm5 /* mm0 = id12 - id56 */
  184. pmulhw mm5, C(2) /* mm5 = xC2S6 * irot_input_x - irot_input_x */
  185. psrlw mm2, 15
  186. movq mm3, TIRY /* mm3 = irot_input_y */
  187. paddw mm5, mm0 /* mm5 = xC2S6 * irot_input_x */
  188. paddw mm5, mm2 /* Truncated */
  189. movq mm2, mm3 /* mm2 = irot_input_y */
  190. pmulhw mm3, C(6) /* mm3 = xC6S2 * irot_input_y */
  191. psrlw mm2, 15
  192. paddw mm3, mm2 /* Truncated */
  193. psubsw mm3, mm5 /* mm3 = op[6] */
  194. movq OH(2), mm3
  195. /*-----------------------------------------------------------------------*/
  196. /* icommon_product1, icommon_product2 */
  197. /*-----------------------------------------------------------------------*/
  198. movq mm0, C(4) /* mm0 = xC4s4 */
  199. movq mm2, mm1 /* mm2 = is12 - is56 */
  200. movq mm3, mm1 /* mm3 = is12 - is56 */
  201. pmulhw mm1, mm0 /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
  202. psrlw mm2, 15
  203. paddw mm1, mm3 /* mm1 = xC4S4 * ( is12 - is56 ) */
  204. paddw mm1, mm2 /* Truncate mm1, now it is icommon_product1 */
  205. movq mm2, mm7 /* mm2 = id12 + id56 */
  206. movq mm3, mm7 /* mm3 = id12 + id56 */
  207. pmulhw mm7, mm0 /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
  208. psrlw mm2, 15 /* For trucation */
  209. paddw mm7, mm3 /* mm7 = xC4S4 * ( id12 + id56 ) */
  210. paddw mm7, mm2 /* Truncate mm7, now it is icommon_product2 */
  211. /*---------------------------------------------------------*/
  212. pxor mm0, mm0 /* Clear mm0 */
  213. psubsw mm0, mm6 /* mm0 = - id34 */
  214. psubsw mm0, mm7 /* mm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
  215. paddsw mm6, mm6 /* mm6 = id34 * 2 */
  216. paddsw mm6, mm0 /* mm6 = id34 - icommon_product2 = irot_input_x for 35 */
  217. psubsw mm4, mm1 /* mm4 = id07 - icommon_product1 = irot_input_x for 35*/
  218. paddsw mm1, mm1 /* mm1 = icommon_product1 * 2 */
  219. paddsw mm1, mm4 /* mm1 = id07 + icommon_product1 = irot_input_x for 17*/
  220. /*---------------------------------------------------------*/
  221. /* op1 and op7
  222. /*---------------------------------------------------------*/
  223. movq mm7, C(1) /* xC1S7 */
  224. movq mm2, mm1 /* mm2 = irot_input_x */
  225. movq mm3, mm1; /* mm3 = irot_input_x */
  226. pmulhw mm1, mm7 /* mm1 = xC1S7 * irot_input_x - irot_input_x */
  227. movq mm7, C(7) /* xC7S1 */
  228. psrlw mm2, 15 /* for trucation */
  229. paddw mm1, mm3 /* mm1 = xC1S7 * irot_input_x */
  230. paddw mm1, mm2 /* Trucated */
  231. pmulhw mm3, mm7 /* mm3 = xC7S1 * irot_input_x */
  232. paddw mm3, mm2 /* Truncated */
  233. movq mm5, mm0 /* mm5 = irot_input_y */
  234. movq mm2, mm0 /* mm2 = irot_input_y */
  235. movq mm7, C(1) /* xC1S7 */
  236. pmulhw mm0, mm7 /* mm0 = xC1S7 * irot_input_y - irot_input_y */
  237. movq mm7, C(7) /* xC7S1 */
  238. psrlw mm2, 15 /* for trucation */
  239. paddw mm0, mm5 /* mm0 = xC1S7 * irot_input_y */
  240. paddw mm0, mm2 /* Truncated */
  241. pmulhw mm5, mm7 /* mm5 = xC7S1 * irot_input_y */
  242. paddw mm5, mm2 /* Truncated */
  243. psubsw mm1, mm5 /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
  244. paddsw mm3, mm0 /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
  245. movq OL(1), mm1
  246. movq OH(3), mm3
  247. /*---------------------------------------------------------*/
  248. /* op3 and op5
  249. /*---------------------------------------------------------*/
  250. movq mm0, C(3) /* xC3S5 */
  251. movq mm1, C(5) /* xC5S3 */
  252. movq mm5,mm6 /* irot_input_x */
  253. movq mm7,mm6 /* irot_input_x */
  254. movq mm2,mm4 /* irot_input_y */
  255. movq mm3,mm4 /* irot_input_y */
  256. pmulhw mm4,mm0 /* mm4 = xC3S5 * irot_input_x - irot_input_x */
  257. pmulhw mm6,mm1 /* mm6 = xC5S3 * irot_input_y - irot_input_y */
  258. psrlw mm2,15 /* for trucation */
  259. psrlw mm5,15 /* for trucation */
  260. paddw mm4,mm3 /* mm4 = xC3S5 * irot_input_x */
  261. paddw mm6,mm7 /* mm6 = xC5S3 * irot_input_y */
  262. paddw mm4,mm2 /* Truncated */
  263. paddw mm6,mm5 /* Truncated */
  264. psubsw mm4,mm6 /* op [3] */
  265. movq OL(3),mm4 /* Save Op[3] */
  266. movq mm4,mm3 /* irot_input_y */
  267. movq mm6,mm7 /* irot_input_x */
  268. pmulhw mm3,mm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
  269. pmulhw mm7,mm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
  270. paddw mm4,mm2 /* Trucated */
  271. paddw mm6,mm5 /* Trucated */
  272. paddw mm3,mm4 /* mm3 = xC5S3 * irot_input_x */
  273. paddw mm7,mm6 /* mm7 = xC3S5 * irot_input_y */
  274. paddw mm3,mm7 /* Op[5] */
  275. movq OH(1),mm3 /* Save Op[5] */
  276. /*---------------------------------------------------------*/
  277. /* End of 4x8 1-D FDCT */
  278. /*---------------------------------------------------------*/
  279. /******************************************************/
  280. /* Do 4x8 Transpose is done through 2 4x4 Transpose */
  281. /******************************************************/
  282. lea eax, [eax+64]
  283. lea ecx, [ecx+64]
  284. lea ebx, [ebx+64]
  285. lea edi, [edi+64]
  286. movq mm4, IH(0) /* mm4=e3e2e1e0 */
  287. movq mm0, IH(1) /* mm4=f3f2f1f0 */
  288. psllw mm4, 1 /* up precision */
  289. psllw mm0, 1 /* up precision */
  290. movq mm5, mm4 /* make a copy */
  291. punpcklwd mm4, mm0 /* mm4=f1e1f0e0 */
  292. punpckhwd mm5, mm0 /* mm5=f3e3f2e2 */
  293. movq mm6, IH(2) /* mm6=g3g2g1g0 */
  294. movq mm0, IH(3) /* mm0=h3h2h1h0 */
  295. psllw mm6, 1 /* up precision */
  296. psllw mm0, 1 /* up precision */
  297. movq mm7, mm6 /* mm7=g3g2g1g0 */
  298. punpcklwd mm6, mm0 /* mm6=h1g1h0g0 */
  299. punpckhwd mm7, mm0 /* mm7=h3g3h2g2 */
  300. movq mm3, mm4 /* mm4=f1e1f0e0 */
  301. punpckldq mm4, mm6 /* mm4=h0g0f0e0 */
  302. punpckhdq mm3, mm6 /* mm3=h1g1f1e1 */
  303. movq mm6, mm5 /* mm5=f3e3f2e2 */
  304. punpckldq mm5, mm7 /* mm5=h2g2f2e2 */
  305. movq IH(0), mm4 /* saveh0g0f0e0 */
  306. punpckhdq mm6, mm7 /* mm6=h3g3f3e3 */
  307. movq IH(2), mm5 /* saveh2g2f2e2 */
  308. movq IH(3), mm6 /* saveh3g3f3e3 */
  309. /*----------------------------------------------------*/
  310. /* mm3 in use for IH(1) */
  311. /*----------------------------------------------------*/
  312. movq mm4, IL(0) /* mm4=a3a2a1a0 */
  313. movq mm0, IL(1) /* mm0=b3b2b1b0 */
  314. psllw mm4, 1 /* up precision */
  315. psllw mm0, 1 /* up precision */
  316. movq mm5, mm4 /* mm5=a3a2a1a0 */
  317. punpcklwd mm4, mm0 /* mm4=b1a1b0a0 */
  318. punpckhwd mm5, mm0 /* mm5=b3a3b2a2 */
  319. movq mm6, IL(2) /* mm6=c3c2c1c0 */
  320. movq mm0, IL(3) /* mm0=d3d2d1d0 */
  321. psllw mm6, 1 /* up precision */
  322. psllw mm0, 1 /* up precision */
  323. movq mm7, mm6 /* mm7=c3c2c1c0 */
  324. punpcklwd mm6, mm0 /* mm6=d1c1d0c0 */
  325. punpckhwd mm7, mm0 /* mm7=c3c3d2c2 */
  326. movq mm1, mm4 /* mm4=b1a1b0a0 */
  327. punpckldq mm4, mm6 /* mm4=d0c0b0a0 */
  328. punpckhdq mm1, mm6 /* mm1=d1c1b1a1 */
  329. movq mm2, mm5 /* mm5=b3a3b2a2 */
  330. punpckldq mm5, mm7 /* mm5=d2c2b2a2 */
  331. punpckhdq mm2, mm7 /* mm6=d3c3b3a3 */
  332. movq IL(2), mm5 /* saved2c2b2a2 */
  333. /*----------------------------------------------------*/
  334. /* mm1 in use for IL(1) */
  335. /* mm2 in use for IL(3) */
  336. /* mm3 in use for IH(1) */
  337. /* mm4 in use for IL(0) */
  338. /*----------------------------------------------------*/
  339. /******************************************************/
  340. /* Let's do the 4x8 forward DCT */
  341. /******************************************************/
  342. movq mm0, mm4 /* mm4 = ip0 */
  343. movq mm5, mm1 /* mm5 = ip1 */
  344. movq mm6, mm2 /* mm6 = ip3 */
  345. movq mm7, mm3 /* mm7 = ip5 */
  346. paddsw mm0, IH(3) /* mm0 = ip0 + ip7 */
  347. paddsw mm1, IL(2) /* mm1 = ip1 + ip2 */
  348. paddsw mm2, IH(0) /* mm2 = ip3 + ip4 */
  349. paddsw mm3, IH(2) /* mm3 = ip5 + ip6 */
  350. psubsw mm4, IH(3) /* mm4 = ip0 - ip7 */
  351. psubsw mm5, IL(2) /* mm5 = ip1 - ip2 */
  352. psubsw mm0, mm2 /* mm0 = is07 - is34 */
  353. paddsw mm2, mm2 /* mm2 = is34 * 2 */
  354. psubsw mm6, IH(0) /* mm6 = ip3 - ip4 */
  355. paddsw mm2, mm0 /* mm2 = is07 + is34 */
  356. psubsw mm1, mm3 /* mm1 = is12 - is56 */
  357. movq TIRY, mm0 /* save is07-is34 */
  358. paddsw mm3, mm3 /* mm3 = is56 * 2 */
  359. paddsw mm3, mm1 /* mm3 = is12 + is56 */
  360. psubsw mm7, IH(2) /* mm7 = ip5 -ip6 */
  361. psubsw mm5, mm7 /* mm5 = id12 - id56 */
  362. paddsw mm7, mm7 /* mm7 = id56 * 2 */
  363. paddsw mm7, mm5 /* mm7 = id12 + id56 */
  364. /*---------------------------------------------------------*/
  365. /* op0 and op4
  366. /*---------------------------------------------------------*/
  367. psubsw mm2, mm3 /* mm2 = is0734 - is1256 */
  368. paddsw mm3, mm3 /* mm3 = is1256 * 2 */
  369. movq mm0, mm2 /* mm0 = is0734 - is1256 */
  370. paddsw mm3, mm2 /* mm3 = is0734 + is1256 */
  371. pmulhw mm0, C(4) /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
  372. paddw mm0, mm2 /* mm0 = xC4S4 * ( is0734 - is1256 ) */
  373. psrlw mm2, 15
  374. paddw mm0, mm2 /* Truncate mm0, now it is op[4] */
  375. movq mm2, mm3 /* mm2 = is0734 + is1256 */
  376. movq OH(0), mm0 /* op4, now mm0,mm2 are free */
  377. movq mm0, mm3 /* mm0 = is0734 + is1256 */
  378. pmulhw mm3, C(4) /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
  379. psrlw mm2, 15
  380. paddw mm3, mm0 /* mm3 = xC4S4 * ( is0734 +is1256 ) */
  381. paddw mm3, mm2 /* Truncate mm3, now it is op[0] */
  382. movq OL(0), mm3 /* save op0 */
  383. /*---------------------------------------------------------*/
  384. /* op2 and op6
  385. /*---------------------------------------------------------*/
  386. movq mm3, TIRY /* mm3 = irot_input_y */
  387. pmulhw mm3, C(2) /* mm3 = xC2S6 * irot_input_y - irot_input_y */
  388. movq mm2, TIRY /* mm2 = irot_input_y */
  389. movq mm0, mm2 /* mm0 = irot_input_y */
  390. psrlw mm2, 15
  391. paddw mm3, mm0 /* mm3 = xC2S6 * irot_input_y */
  392. paddw mm3, mm2 /* Truncated */
  393. movq mm0, mm5 /* mm0 = id12 - id56 */
  394. movq mm2, mm5 /* mm2 = id12 - id56 */
  395. pmulhw mm0, C(6) /* mm0 = xC6S2 * irot_input_x */
  396. psrlw mm2, 15
  397. paddw mm0, mm2 /* Truncated */
  398. paddsw mm3, mm0 /* op[2] */
  399. movq OL(2), mm3 /* save op[2] */
  400. movq mm0, mm5 /* mm0 = id12 - id56 */
  401. movq mm2, mm5 /* mm0 = id12 - id56 */
  402. pmulhw mm5, C(2) /* mm5 = xC2S6 * irot_input_x - irot_input_x */
  403. psrlw mm2, 15
  404. movq mm3, TIRY /* mm3 = irot_input_y */
  405. paddw mm5, mm0 /* mm5 = xC2S6 * irot_input_x */
  406. paddw mm5, mm2 /* Truncated */
  407. movq mm2, mm3 /* mm2 = irot_input_y */
  408. pmulhw mm3, C(6) /* mm3 = xC6S2 * irot_input_y */
  409. psrlw mm2, 15
  410. paddw mm3, mm2 /* Truncated */
  411. psubsw mm3, mm5 /* mm3 = op[6] */
  412. movq OH(2), mm3
  413. /*-----------------------------------------------------------------------*/
  414. /* icommon_product1, icommon_product2 */
  415. /*-----------------------------------------------------------------------*/
  416. movq mm0, C(4) /* mm0 = xC4s4 */
  417. movq mm2, mm1 /* mm2 = is12 - is56 */
  418. movq mm3, mm1 /* mm3 = is12 - is56 */
  419. pmulhw mm1, mm0 /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
  420. psrlw mm2, 15
  421. paddw mm1, mm3 /* mm1 = xC4S4 * ( is12 - is56 ) */
  422. paddw mm1, mm2 /* Truncate mm1, now it is icommon_product1 */
  423. movq mm2, mm7 /* mm2 = id12 + id56 */
  424. movq mm3, mm7 /* mm3 = id12 + id56 */
  425. pmulhw mm7, mm0 /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
  426. psrlw mm2, 15 /* For trucation */
  427. paddw mm7, mm3 /* mm7 = xC4S4 * ( id12 + id56 ) */
  428. paddw mm7, mm2 /* Truncate mm7, now it is icommon_product2 */
  429. /*---------------------------------------------------------*/
  430. pxor mm0, mm0 /* Clear mm0 */
  431. psubsw mm0, mm6 /* mm0 = - id34 */
  432. psubsw mm0, mm7 /* mm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
  433. paddsw mm6, mm6 /* mm6 = id34 * 2 */
  434. paddsw mm6, mm0 /* mm6 = id34 - icommon_product2 = irot_input_x for 35 */
  435. psubsw mm4, mm1 /* mm4 = id07 - icommon_product1 = irot_input_x for 35*/
  436. paddsw mm1, mm1 /* mm1 = icommon_product1 * 2 */
  437. paddsw mm1, mm4 /* mm1 = id07 + icommon_product1 = irot_input_x for 17*/
  438. /*---------------------------------------------------------*/
  439. /* op1 and op7
  440. /*---------------------------------------------------------*/
  441. movq mm7, C(1) /* xC1S7 */
  442. movq mm2, mm1 /* mm2 = irot_input_x */
  443. movq mm3, mm1; /* mm3 = irot_input_x */
  444. pmulhw mm1, mm7 /* mm1 = xC1S7 * irot_input_x - irot_input_x */
  445. movq mm7, C(7) /* xC7S1 */
  446. psrlw mm2, 15 /* for trucation */
  447. paddw mm1, mm3 /* mm1 = xC1S7 * irot_input_x */
  448. paddw mm1, mm2 /* Trucated */
  449. pmulhw mm3, mm7 /* mm3 = xC7S1 * irot_input_x */
  450. paddw mm3, mm2 /* Truncated */
  451. movq mm5, mm0 /* mm5 = irot_input_y */
  452. movq mm2, mm0 /* mm2 = irot_input_y */
  453. movq mm7, C(1) /* xC1S7 */
  454. pmulhw mm0, mm7 /* mm0 = xC1S7 * irot_input_y - irot_input_y */
  455. movq mm7, C(7) /* xC7S1 */
  456. psrlw mm2, 15 /* for trucation */
  457. paddw mm0, mm5 /* mm0 = xC1S7 * irot_input_y */
  458. paddw mm0, mm2 /* Truncated */
  459. pmulhw mm5, mm7 /* mm5 = xC7S1 * irot_input_y */
  460. paddw mm5, mm2 /* Truncated */
  461. psubsw mm1, mm5 /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
  462. paddsw mm3, mm0 /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
  463. movq OL(1), mm1
  464. movq OH(3), mm3
  465. /*---------------------------------------------------------*/
  466. /* op3 and op5
  467. /*---------------------------------------------------------*/
  468. movq mm0, C(3) /* xC3S5 */
  469. movq mm1, C(5) /* xC5S3 */
  470. movq mm5,mm6 /* irot_input_x */
  471. movq mm7,mm6 /* irot_input_x */
  472. movq mm2,mm4 /* irot_input_y */
  473. movq mm3,mm4 /* irot_input_y */
  474. pmulhw mm4,mm0 /* mm4 = xC3S5 * irot_input_x - irot_input_x */
  475. pmulhw mm6,mm1 /* mm6 = xC5S3 * irot_input_y - irot_input_y */
  476. psrlw mm2,15 /* for trucation */
  477. psrlw mm5,15 /* for trucation */
  478. paddw mm4,mm3 /* mm4 = xC3S5 * irot_input_x */
  479. paddw mm6,mm7 /* mm6 = xC5S3 * irot_input_y */
  480. paddw mm4,mm2 /* Truncated */
  481. paddw mm6,mm5 /* Truncated */
  482. psubsw mm4,mm6 /* op [3] */
  483. movq OL(3),mm4 /* Save Op[3] */
  484. movq mm4,mm3 /* irot_input_y */
  485. movq mm6,mm7 /* irot_input_x */
  486. pmulhw mm3,mm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
  487. pmulhw mm7,mm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
  488. paddw mm4,mm2 /* Trucated */
  489. paddw mm6,mm5 /* Trucated */
  490. paddw mm3,mm4 /* mm3 = xC5S3 * irot_input_x */
  491. paddw mm7,mm6 /* mm7 = xC3S5 * irot_input_y */
  492. paddw mm3,mm7 /* Op[5] */
  493. movq OH(1),mm3 /* Save Op[5] */
  494. /*---------------------------------------------------------*/
  495. /* End of Horizontal FDCT */
  496. /*---------------------------------------------------------*/
  497. lea eax, [ebx-64]
  498. lea esi, [edi-64]
  499. #undef IL
  500. #undef IH
  501. #undef OL
  502. #undef OH
  503. #define IL(i) [eax + 16 * i]
  504. #define IH(i) [ebx + 16 * i]
  505. #define OL(i) [eax + 16 * i]
  506. #define OH(i) [ebx + 16 * i]
  507. /******************************************************/
  508. /* Do 4x8 Transpose is done through 2 4x4 Transpose */
  509. /******************************************************/
  510. movq mm4, IH(0) /* mm4=e3e2e1e0 */
  511. movq mm0, IH(1) /* mm4=f3f2f1f0 */
  512. movq mm5, mm4 /* make a copy */
  513. punpcklwd mm4, mm0 /* mm4=f1e1f0e0 */
  514. punpckhwd mm5, mm0 /* mm5=f3e3f2e2 */
  515. movq mm6, IH(2) /* mm6=g3g2g1g0 */
  516. movq mm0, IH(3) /* mm0=h3h2h1h0 */
  517. movq mm7, mm6 /* mm7=g3g2g1g0 */
  518. punpcklwd mm6, mm0 /* mm6=h1g1h0g0 */
  519. punpckhwd mm7, mm0 /* mm7=h3g3h2g2 */
  520. movq mm3, mm4 /* mm4=f1e1f0e0 */
  521. punpckldq mm4, mm6 /* mm4=h0g0f0e0 */
  522. punpckhdq mm3, mm6 /* mm3=h1g1f1e1 */
  523. movq mm6, mm5 /* mm5=f3e3f2e2 */
  524. punpckldq mm5, mm7 /* mm5=h2g2f2e2 */
  525. movq IH(0), mm4 /* saveh0g0f0e0 */
  526. punpckhdq mm6, mm7 /* mm6=h3g3f3e3 */
  527. movq IH(2), mm5 /* saveh2g2f2e2 */
  528. movq IH(3), mm6 /* saveh3g3f3e3 */
  529. /*----------------------------------------------------*/
  530. /* mm3 in use for IH(1) */
  531. /*----------------------------------------------------*/
  532. movq mm4, IL(0) /* mm4=a3a2a1a0 */
  533. movq mm0, IL(1) /* mm0=b3b2b1b0 */
  534. movq mm5, mm4 /* mm5=a3a2a1a0 */
  535. punpcklwd mm4, mm0 /* mm4=b1a1b0a0 */
  536. punpckhwd mm5, mm0 /* mm5=b3a3b2a2 */
  537. movq mm6, IL(2) /* mm6=c3c2c1c0 */
  538. movq mm0, IL(3) /* mm0=d3d2d1d0 */
  539. movq mm7, mm6 /* mm7=c3c2c1c0 */
  540. punpcklwd mm6, mm0 /* mm6=d1c1d0c0 */
  541. punpckhwd mm7, mm0 /* mm7=c3c3d2c2 */
  542. movq mm1, mm4 /* mm4=b1a1b0a0 */
  543. punpckldq mm4, mm6 /* mm4=d0c0b0a0 */
  544. punpckhdq mm1, mm6 /* mm1=d1c1b1a1 */
  545. movq mm2, mm5 /* mm5=b3a3b2a2 */
  546. punpckldq mm5, mm7 /* mm5=d2c2b2a2 */
  547. punpckhdq mm2, mm7 /* mm6=d3c3b3a3 */
  548. movq IL(2), mm5 /* saved2c2b2a2 */
  549. /*----------------------------------------------------*/
  550. /* mm1 in use for IL(1) */
  551. /* mm2 in use for IL(3) */
  552. /* mm3 in use for IH(1) */
  553. /* mm4 in use for IL(0) */
  554. /*----------------------------------------------------*/
  555. /******************************************************/
  556. /* Let's do the 4x8 forward DCT */
  557. /******************************************************/
  558. movq mm0, mm4 /* mm4 = ip0 */
  559. movq mm5, mm1 /* mm5 = ip1 */
  560. movq mm6, mm2 /* mm6 = ip3 */
  561. movq mm7, mm3 /* mm7 = ip5 */
  562. paddsw mm0, IH(3) /* mm0 = ip0 + ip7 */
  563. paddsw mm1, IL(2) /* mm1 = ip1 + ip2 */
  564. paddsw mm2, IH(0) /* mm2 = ip3 + ip4 */
  565. paddsw mm3, IH(2) /* mm3 = ip5 + ip6 */
  566. psubsw mm4, IH(3) /* mm4 = ip0 - ip7 */
  567. psubsw mm5, IL(2) /* mm5 = ip1 - ip2 */
  568. psubsw mm0, mm2 /* mm0 = is07 - is34 */
  569. paddsw mm2, mm2 /* mm2 = is34 * 2 */
  570. psubsw mm6, IH(0) /* mm6 = ip3 - ip4 */
  571. paddsw mm2, mm0 /* mm2 = is07 + is34 */
  572. psubsw mm1, mm3 /* mm1 = is12 - is56 */
  573. movq TIRY, mm0 /* save is07-is34 */
  574. paddsw mm3, mm3 /* mm3 = is56 * 2 */
  575. paddsw mm3, mm1 /* mm3 = is12 + is56 */
  576. psubsw mm7, IH(2) /* mm7 = ip5 -ip6 */
  577. psubsw mm5, mm7 /* mm5 = id12 - id56 */
  578. paddsw mm7, mm7 /* mm7 = id56 * 2 */
  579. paddsw mm7, mm5 /* mm7 = id12 + id56 */
  580. /*---------------------------------------------------------*/
  581. /* op0 and op4
  582. /*---------------------------------------------------------*/
  583. psubsw mm2, mm3 /* mm2 = is0734 - is1256 */
  584. paddsw mm3, mm3 /* mm3 = is1256 * 2 */
  585. movq mm0, mm2 /* mm0 = is0734 - is1256 */
  586. paddsw mm3, mm2 /* mm3 = is0734 + is1256 */
  587. pmulhw mm0, C(4) /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
  588. paddw mm0, mm2 /* mm0 = xC4S4 * ( is0734 - is1256 ) */
  589. psrlw mm2, 15
  590. paddw mm0, mm2 /* Truncate mm0, now it is op[4] */
  591. movq mm2, mm0
  592. psrlw mm0, 15
  593. paddw mm0, mm2
  594. psraw mm0, 1
  595. movq OH(0), mm0 /* op4, now mm0,mm2 are free */
  596. movq mm2, mm3 /* mm2 = is0734 + is1256 */
  597. movq mm0, mm3 /* mm0 = is0734 + is1256 */
  598. pmulhw mm3, C(4) /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
  599. psrlw mm2, 15
  600. paddw mm3, mm0 /* mm3 = xC4S4 * ( is0734 +is1256 ) */
  601. paddw mm3, mm2 /* Truncate mm3, now it is op[0] */
  602. movq mm2, mm3
  603. psrlw mm3, 15
  604. paddw mm3, mm2
  605. psraw mm3, 1
  606. movq OL(0), mm3 /* save op0 */
  607. /*---------------------------------------------------------*/
  608. /* op2 and op6
  609. /*---------------------------------------------------------*/
  610. movq mm3, TIRY /* mm3 = irot_input_y */
  611. pmulhw mm3, C(2) /* mm3 = xC2S6 * irot_input_y - irot_input_y */
  612. movq mm2, TIRY /* mm2 = irot_input_y */
  613. movq mm0, mm2 /* mm0 = irot_input_y */
  614. psrlw mm2, 15
  615. paddw mm3, mm0 /* mm3 = xC2S6 * irot_input_y */
  616. paddw mm3, mm2 /* Truncated */
  617. movq mm0, mm5 /* mm0 = id12 - id56 */
  618. movq mm2, mm5 /* mm2 = id12 - id56 */
  619. pmulhw mm0, C(6) /* mm0 = xC6S2 * irot_input_x */
  620. psrlw mm2, 15
  621. paddw mm0, mm2 /* Truncated */
  622. paddsw mm3, mm0 /* op[2] */
  623. movq mm0, mm3
  624. psrlw mm3, 15
  625. paddw mm3, mm0
  626. psraw mm3, 1
  627. movq OL(2), mm3 /* save op[2] */
  628. movq mm0, mm5 /* mm0 = id12 - id56 */
  629. movq mm2, mm5 /* mm0 = id12 - id56 */
  630. pmulhw mm5, C(2) /* mm5 = xC2S6 * irot_input_x - irot_input_x */
  631. psrlw mm2, 15
  632. movq mm3, TIRY /* mm3 = irot_input_y */
  633. paddw mm5, mm0 /* mm5 = xC2S6 * irot_input_x */
  634. paddw mm5, mm2 /* Truncated */
  635. movq mm2, mm3 /* mm2 = irot_input_y */
  636. pmulhw mm3, C(6) /* mm3 = xC6S2 * irot_input_y */
  637. psrlw mm2, 15
  638. paddw mm3, mm2 /* Truncated */
  639. psubsw mm3, mm5 /* mm3 = op[6] */
  640. movq mm5, mm3
  641. psrlw mm3, 15
  642. paddw mm3, mm5
  643. psraw mm3, 1
  644. movq OH(2), mm3
  645. /*-----------------------------------------------------------------------*/
  646. /* icommon_product1, icommon_product2 */
  647. /*-----------------------------------------------------------------------*/
  648. movq mm0, C(4) /* mm0 = xC4s4 */
  649. movq mm2, mm1 /* mm2 = is12 - is56 */
  650. movq mm3, mm1 /* mm3 = is12 - is56 */
  651. pmulhw mm1, mm0 /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
  652. psrlw mm2, 15
  653. paddw mm1, mm3 /* mm1 = xC4S4 * ( is12 - is56 ) */
  654. paddw mm1, mm2 /* Truncate mm1, now it is icommon_product1 */
  655. movq mm2, mm7 /* mm2 = id12 + id56 */
  656. movq mm3, mm7 /* mm3 = id12 + id56 */
  657. pmulhw mm7, mm0 /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
  658. psrlw mm2, 15 /* For trucation */
  659. paddw mm7, mm3 /* mm7 = xC4S4 * ( id12 + id56 ) */
  660. paddw mm7, mm2 /* Truncate mm7, now it is icommon_product2 */
  661. /*---------------------------------------------------------*/
  662. pxor mm0, mm0 /* Clear mm0 */
  663. psubsw mm0, mm6 /* mm0 = - id34 */
  664. psubsw mm0, mm7 /* mm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
  665. paddsw mm6, mm6 /* mm6 = id34 * 2 */
  666. paddsw mm6, mm0 /* mm6 = id34 - icommon_product2 = irot_input_x for 35 */
  667. psubsw mm4, mm1 /* mm4 = id07 - icommon_product1 = irot_input_x for 35*/
  668. paddsw mm1, mm1 /* mm1 = icommon_product1 * 2 */
  669. paddsw mm1, mm4 /* mm1 = id07 + icommon_product1 = irot_input_x for 17*/
  670. /*---------------------------------------------------------*/
  671. /* op1 and op7 */
  672. /*---------------------------------------------------------*/
  673. movq mm7, C(1) /* xC1S7 */
  674. movq mm2, mm1 /* mm2 = irot_input_x */
  675. movq mm3, mm1; /* mm3 = irot_input_x */
  676. pmulhw mm1, mm7 /* mm1 = xC1S7 * irot_input_x - irot_input_x */
  677. movq mm7, C(7) /* xC7S1 */
  678. psrlw mm2, 15 /* for trucation */
  679. paddw mm1, mm3 /* mm1 = xC1S7 * irot_input_x */
  680. paddw mm1, mm2 /* Trucated */
  681. pmulhw mm3, mm7 /* mm3 = xC7S1 * irot_input_x */
  682. paddw mm3, mm2 /* Truncated */
  683. movq mm5, mm0 /* mm5 = irot_input_y */
  684. movq mm2, mm0 /* mm2 = irot_input_y */
  685. movq mm7, C(1) /* xC1S7 */
  686. pmulhw mm0, mm7 /* mm0 = xC1S7 * irot_input_y - irot_input_y */
  687. movq mm7, C(7) /* xC7S1 */
  688. psrlw mm2, 15 /* for trucation */
  689. paddw mm0, mm5 /* mm0 = xC1S7 * irot_input_y */
  690. paddw mm0, mm2 /* Truncated */
  691. pmulhw mm5, mm7 /* mm5 = xC7S1 * irot_input_y */
  692. paddw mm5, mm2 /* Truncated */
  693. psubsw mm1, mm5 /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
  694. paddsw mm3, mm0 /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = op[7] */
  695. movq mm5, mm1
  696. movq mm0, mm3
  697. psrlw mm1, 15
  698. psrlw mm3, 15
  699. paddw mm1, mm5
  700. paddw mm3, mm0
  701. psraw mm1, 1
  702. psraw mm3, 1
  703. movq OL(1), mm1
  704. movq OH(3), mm3
  705. /*---------------------------------------------------------*/
  706. /* op3 and op5 */
  707. /*---------------------------------------------------------*/
  708. movq mm0, C(3) /* xC3S5 */
  709. movq mm1, C(5) /* xC5S3 */
  710. movq mm5,mm6 /* irot_input_x */
  711. movq mm7,mm6 /* irot_input_x */
  712. movq mm2,mm4 /* irot_input_y */
  713. movq mm3,mm4 /* irot_input_y */
  714. pmulhw mm4,mm0 /* mm4 = xC3S5 * irot_input_x - irot_input_x */
  715. pmulhw mm6,mm1 /* mm6 = xC5S3 * irot_input_y - irot_input_y */
  716. psrlw mm2,15 /* for trucation */
  717. psrlw mm5,15 /* for trucation */
  718. paddw mm4,mm3 /* mm4 = xC3S5 * irot_input_x */
  719. paddw mm6,mm7 /* mm6 = xC5S3 * irot_input_y */
  720. paddw mm4,mm2 /* Truncated */
  721. paddw mm6,mm5 /* Truncated */
  722. psubsw mm4,mm6 /* op [3] */
  723. movq mm6,mm4
  724. psrlw mm4,15
  725. paddw mm4,mm6
  726. psraw mm4,1
  727. movq OL(3),mm4 /* Save Op[3] */
  728. movq mm4,mm3 /* irot_input_y */
  729. movq mm6,mm7 /* irot_input_x */
  730. pmulhw mm3,mm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
  731. pmulhw mm7,mm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
  732. paddw mm4,mm2 /* Trucated */
  733. paddw mm6,mm5 /* Trucated */
  734. paddw mm3,mm4 /* mm3 = xC5S3 * irot_input_x */
  735. paddw mm7,mm6 /* mm7 = xC3S5 * irot_input_y */
  736. paddw mm3,mm7 /* Op[5] */
  737. movq mm7,mm3
  738. psrlw mm3,15
  739. paddw mm3,mm7
  740. psraw mm3,1
  741. movq OH(1),mm3 /* Save Op[5] */
  742. /*---------------------------------------------------------*/
  743. /* End of 4x8 1-D FDCT */
  744. /*---------------------------------------------------------*/
  745. lea eax, [eax+8]
  746. lea ebx, [ebx+8]
  747. /******************************************************/
  748. /* The 4x8 transpose is done through two 4x4 transposes */
  749. /******************************************************/
  750. movq mm4, IH(0) /* mm4=e3e2e1e0 */
  751. movq mm0, IH(1) /* mm0=f3f2f1f0 */
  752. movq mm5, mm4 /* make a copy */
  753. punpcklwd mm4, mm0 /* mm4=f1e1f0e0 */
  754. punpckhwd mm5, mm0 /* mm5=f3e3f2e2 */
  755. movq mm6, IH(2) /* mm6=g3g2g1g0 */
  756. movq mm0, IH(3) /* mm0=h3h2h1h0 */
  757. movq mm7, mm6 /* mm7=g3g2g1g0 */
  758. punpcklwd mm6, mm0 /* mm6=h1g1h0g0 */
  759. punpckhwd mm7, mm0 /* mm7=h3g3h2g2 */
  760. movq mm3, mm4 /* mm4=f1e1f0e0 */
  761. punpckldq mm4, mm6 /* mm4=h0g0f0e0 */
  762. punpckhdq mm3, mm6 /* mm3=h1g1f1e1 */
  763. movq mm6, mm5 /* mm5=f3e3f2e2 */
  764. punpckldq mm5, mm7 /* mm5=h2g2f2e2 */
  765. movq IH(0), mm4 /* saveh0g0f0e0 */
  766. punpckhdq mm6, mm7 /* mm6=h3g3f3e3 */
  767. movq IH(2), mm5 /* saveh2g2f2e2 */
  768. movq IH(3), mm6 /* saveh3g3f3e3 */
  769. /*----------------------------------------------------*/
  770. /* mm3 in use for IH(1) */
  771. /*----------------------------------------------------*/
  772. movq mm4, IL(0) /* mm4=a3a2a1a0 */
  773. movq mm0, IL(1) /* mm0=b3b2b1b0 */
  774. movq mm5, mm4 /* mm5=a3a2a1a0 */
  775. punpcklwd mm4, mm0 /* mm4=b1a1b0a0 */
  776. punpckhwd mm5, mm0 /* mm5=b3a3b2a2 */
  777. movq mm6, IL(2) /* mm6=c3c2c1c0 */
  778. movq mm0, IL(3) /* mm0=d3d2d1d0 */
  779. movq mm7, mm6 /* mm7=c3c2c1c0 */
  780. punpcklwd mm6, mm0 /* mm6=d1c1d0c0 */
  781. punpckhwd mm7, mm0 /* mm7=d3c3d2c2 */
  782. movq mm1, mm4 /* mm4=b1a1b0a0 */
  783. punpckldq mm4, mm6 /* mm4=d0c0b0a0 */
  784. punpckhdq mm1, mm6 /* mm1=d1c1b1a1 */
  785. movq mm2, mm5 /* mm5=b3a3b2a2 */
  786. punpckldq mm5, mm7 /* mm5=d2c2b2a2 */
  787. punpckhdq mm2, mm7 /* mm2=d3c3b3a3 */
  788. movq IL(2), mm5 /* saved2c2b2a2 */
  789. /*----------------------------------------------------*/
  790. /* mm1 in use for IL(1) */
  791. /* mm2 in use for IL(3) */
  792. /* mm3 in use for IH(1) */
  793. /* mm4 in use for IL(0) */
  794. /*----------------------------------------------------*/
  795. /******************************************************/
  796. /* Let's do the 4x8 forward DCT */
  797. /******************************************************/
  798. movq mm0, mm4 /* mm4 = ip0 */
  799. movq mm5, mm1 /* mm5 = ip1 */
  800. movq mm6, mm2 /* mm6 = ip3 */
  801. movq mm7, mm3 /* mm7 = ip5 */
  802. paddsw mm0, IH(3) /* mm0 = ip0 + ip7 */
  803. paddsw mm1, IL(2) /* mm1 = ip1 + ip2 */
  804. paddsw mm2, IH(0) /* mm2 = ip3 + ip4 */
  805. paddsw mm3, IH(2) /* mm3 = ip5 + ip6 */
  806. psubsw mm4, IH(3) /* mm4 = ip0 - ip7 */
  807. psubsw mm5, IL(2) /* mm5 = ip1 - ip2 */
  808. psubsw mm0, mm2 /* mm0 = is07 - is34 */
  809. paddsw mm2, mm2 /* mm2 = is34 * 2 */
  810. psubsw mm6, IH(0) /* mm6 = ip3 - ip4 */
  811. paddsw mm2, mm0 /* mm2 = is07 + is34 */
  812. psubsw mm1, mm3 /* mm1 = is12 - is56 */
  813. movq TIRY, mm0 /* save is07-is34 */
  814. paddsw mm3, mm3 /* mm3 = is56 * 2 */
  815. paddsw mm3, mm1 /* mm3 = is12 + is56 */
  816. psubsw mm7, IH(2) /* mm7 = ip5 -ip6 */
  817. psubsw mm5, mm7 /* mm5 = id12 - id56 */
  818. paddsw mm7, mm7 /* mm7 = id56 * 2 */
  819. paddsw mm7, mm5 /* mm7 = id12 + id56 */
  820. /*---------------------------------------------------------*/
  821. /* op0 and op4 */
  822. /*---------------------------------------------------------*/
  823. psubsw mm2, mm3 /* mm2 = is0734 - is1256 */
  824. paddsw mm3, mm3 /* mm3 = is1256 * 2 */
  825. movq mm0, mm2 /* mm0 = is0734 - is1256 */
  826. paddsw mm3, mm2 /* mm3 = is0734 + is1256 */
  827. pmulhw mm0, C(4) /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
  828. paddw mm0, mm2 /* mm0 = xC4S4 * ( is0734 - is1256 ) */
  829. psrlw mm2, 15
  830. paddw mm0, mm2 /* Truncate mm0, now it is op[4] */
  831. movq mm2, mm0
  832. psrlw mm0, 15
  833. paddw mm0, mm2
  834. psraw mm0, 1
  835. movq OH(0), mm0 /* op4, now mm0,mm2 are free */
  836. movq mm2, mm3 /* mm2 = is0734 + is1256 */
  837. movq mm0, mm3 /* mm0 = is0734 + is1256 */
  838. pmulhw mm3, C(4) /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
  839. psrlw mm2, 15
  840. paddw mm3, mm0 /* mm3 = xC4S4 * ( is0734 +is1256 ) */
  841. paddw mm3, mm2 /* Truncate mm3, now it is op[0] */
  842. movq mm2, mm3
  843. psrlw mm3, 15
  844. paddw mm3, mm2
  845. psraw mm3, 1
  846. movq OL(0), mm3 /* save op0 */
  847. /*---------------------------------------------------------*/
  848. /* op2 and op6 */
  849. /*---------------------------------------------------------*/
  850. movq mm3, TIRY /* mm3 = irot_input_y */
  851. pmulhw mm3, C(2) /* mm3 = xC2S6 * irot_input_y - irot_input_y */
  852. movq mm2, TIRY /* mm2 = irot_input_y */
  853. movq mm0, mm2 /* mm0 = irot_input_y */
  854. psrlw mm2, 15
  855. paddw mm3, mm0 /* mm3 = xC2S6 * irot_input_y */
  856. paddw mm3, mm2 /* Truncated */
  857. movq mm0, mm5 /* mm0 = id12 - id56 */
  858. movq mm2, mm5 /* mm2 = id12 - id56 */
  859. pmulhw mm0, C(6) /* mm0 = xC6S2 * irot_input_x */
  860. psrlw mm2, 15
  861. paddw mm0, mm2 /* Truncated */
  862. paddsw mm3, mm0 /* op[2] */
  863. movq mm0, mm3
  864. psrlw mm3, 15
  865. paddw mm3, mm0
  866. psraw mm3, 1
  867. movq OL(2), mm3 /* save op[2] */
  868. movq mm0, mm5 /* mm0 = id12 - id56 */
  869. movq mm2, mm5 /* mm0 = id12 - id56 */
  870. pmulhw mm5, C(2) /* mm5 = xC2S6 * irot_input_x - irot_input_x */
  871. psrlw mm2, 15
  872. movq mm3, TIRY /* mm3 = irot_input_y */
  873. paddw mm5, mm0 /* mm5 = xC2S6 * irot_input_x */
  874. paddw mm5, mm2 /* Truncated */
  875. movq mm2, mm3 /* mm2 = irot_input_y */
  876. pmulhw mm3, C(6) /* mm3 = xC6S2 * irot_input_y */
  877. psrlw mm2, 15
  878. paddw mm3, mm2 /* Truncated */
  879. psubsw mm3, mm5 /* mm3 = op[6] */
  880. movq mm5, mm3
  881. psrlw mm3, 15
  882. paddw mm3, mm5
  883. psraw mm3, 1
  884. movq OH(2), mm3
  885. /*-----------------------------------------------------------------------*/
  886. /* icommon_product1, icommon_product2 */
  887. /*-----------------------------------------------------------------------*/
  888. movq mm0, C(4) /* mm0 = xC4s4 */
  889. movq mm2, mm1 /* mm2 = is12 - is56 */
  890. movq mm3, mm1 /* mm3 = is12 - is56 */
  891. pmulhw mm1, mm0 /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
  892. psrlw mm2, 15
  893. paddw mm1, mm3 /* mm1 = xC4S4 * ( is12 - is56 ) */
  894. paddw mm1, mm2 /* Truncate mm1, now it is icommon_product1 */
  895. movq mm2, mm7 /* mm2 = id12 + id56 */
  896. movq mm3, mm7 /* mm3 = id12 + id56 */
  897. pmulhw mm7, mm0 /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
  898. psrlw mm2, 15 /* For trucation */
  899. paddw mm7, mm3 /* mm7 = xC4S4 * ( id12 + id56 ) */
  900. paddw mm7, mm2 /* Truncate mm7, now it is icommon_product2 */
  901. /*---------------------------------------------------------*/
  902. pxor mm0, mm0 /* Clear mm0 */
  903. psubsw mm0, mm6 /* mm0 = - id34 */
  904. psubsw mm0, mm7 /* mm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
  905. paddsw mm6, mm6 /* mm6 = id34 * 2 */
  906. paddsw mm6, mm0 /* mm6 = id34 - icommon_product2 = irot_input_x for 35 */
  907. psubsw mm4, mm1 /* mm4 = id07 - icommon_product1 = irot_input_x for 35*/
  908. paddsw mm1, mm1 /* mm1 = icommon_product1 * 2 */
  909. paddsw mm1, mm4 /* mm1 = id07 + icommon_product1 = irot_input_x for 17*/
  910. /*---------------------------------------------------------*/
  911. /* op1 and op7 */
  912. /*---------------------------------------------------------*/
  913. movq mm7, C(1) /* xC1S7 */
  914. movq mm2, mm1 /* mm2 = irot_input_x */
  915. movq mm3, mm1; /* mm3 = irot_input_x */
  916. pmulhw mm1, mm7 /* mm1 = xC1S7 * irot_input_x - irot_input_x */
  917. movq mm7, C(7) /* xC7S1 */
  918. psrlw mm2, 15 /* for trucation */
  919. paddw mm1, mm3 /* mm1 = xC1S7 * irot_input_x */
  920. paddw mm1, mm2 /* Trucated */
  921. pmulhw mm3, mm7 /* mm3 = xC7S1 * irot_input_x */
  922. paddw mm3, mm2 /* Truncated */
  923. movq mm5, mm0 /* mm5 = irot_input_y */
  924. movq mm2, mm0 /* mm2 = irot_input_y */
  925. movq mm7, C(1) /* xC1S7 */
  926. pmulhw mm0, mm7 /* mm0 = xC1S7 * irot_input_y - irot_input_y */
  927. movq mm7, C(7) /* xC7S1 */
  928. psrlw mm2, 15 /* for trucation */
  929. paddw mm0, mm5 /* mm0 = xC1S7 * irot_input_y */
  930. paddw mm0, mm2 /* Truncated */
  931. pmulhw mm5, mm7 /* mm5 = xC7S1 * irot_input_y */
  932. paddw mm5, mm2 /* Truncated */
  933. psubsw mm1, mm5 /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
  934. paddsw mm3, mm0 /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = op[7] */
  935. movq mm5, mm1
  936. movq mm0, mm3
  937. psrlw mm1, 15
  938. psrlw mm3, 15
  939. paddw mm1, mm5
  940. paddw mm3, mm0
  941. psraw mm1, 1
  942. psraw mm3, 1
  943. movq OL(1), mm1
  944. movq OH(3), mm3
  945. /*---------------------------------------------------------*/
  946. /* op3 and op5 */
  947. /*---------------------------------------------------------*/
  948. movq mm0, C(3) /* xC3S5 */
  949. movq mm1, C(5) /* xC5S3 */
  950. movq mm5,mm6 /* irot_input_x */
  951. movq mm7,mm6 /* irot_input_x */
  952. movq mm2,mm4 /* irot_input_y */
  953. movq mm3,mm4 /* irot_input_y */
  954. pmulhw mm4,mm0 /* mm4 = xC3S5 * irot_input_x - irot_input_x */
  955. pmulhw mm6,mm1 /* mm6 = xC5S3 * irot_input_y - irot_input_y */
  956. psrlw mm2,15 /* for trucation */
  957. psrlw mm5,15 /* for trucation */
  958. paddw mm4,mm3 /* mm4 = xC3S5 * irot_input_x */
  959. paddw mm6,mm7 /* mm6 = xC5S3 * irot_input_y */
  960. paddw mm4,mm2 /* Truncated */
  961. paddw mm6,mm5 /* Truncated */
  962. psubsw mm4,mm6 /* op [3] */
  963. movq mm6,mm4
  964. psrlw mm4,15
  965. paddw mm4,mm6
  966. psraw mm4,1
  967. movq OL(3),mm4 /* Save Op[3] */
  968. movq mm4,mm3 /* irot_input_y */
  969. movq mm6,mm7 /* irot_input_x */
  970. pmulhw mm3,mm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
  971. pmulhw mm7,mm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
  972. paddw mm4,mm2 /* Trucated */
  973. paddw mm6,mm5 /* Trucated */
  974. paddw mm3,mm4 /* mm3 = xC5S3 * irot_input_x */
  975. paddw mm7,mm6 /* mm7 = xC3S5 * irot_input_y */
  976. paddw mm3,mm7 /* Op[5] */
  977. movq mm7,mm3
  978. psrlw mm3,15
  979. paddw mm3,mm7
  980. psraw mm3,1
  981. movq OH(1),mm3 /* Save Op[5] */
  982. /*---------------------------------------------------------*/
  983. /* End of 4x8 1-D FDCT */
  984. /*---------------------------------------------------------*/
  985. }/* end of _asm code section */
  986. }