123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810 |
- /****************************************************************************
- *
- * Module Title : Fdctwmt.c
- *
- * Description : Forward DCT optimized specifically for Intel P4
- * processor
- *
- * AUTHOR : YaoWu Xu
- *
- *****************************************************************************
- * Revision History
- *
- * 1.00 YWX 03/11/02 Configuration baseline
- *
- *****************************************************************************
- */
- /*******************************************************************************
- * Module Constants
- *******************************************************************************
- */
-
- __declspec(align(16)) static unsigned short TIRY[8];
- __declspec(align(16)) static unsigned short WmtIdctConst[8 * 8] =
- {
- 0, 0, 0, 0, 0, 0, 0, 0,
- 64277,64277,64277,64277,64277,64277,64277,64277,
- 60547,60547,60547,60547,60547,60547,60547,60547,
- 54491,54491,54491,54491,54491,54491,54491,54491,
- 46341,46341,46341,46341,46341,46341,46341,46341,
- 36410,36410,36410,36410,36410,36410,36410,36410,
- 25080,25080,25080,25080,25080,25080,25080,25080,
- 12785,12785,12785,12785,12785,12785,12785,12785
- };
-
- /**************************************************************************************
- *
- * Macro: FDct_WMT
- *
- * Description: The Macro does 1-D IDct on 8 columns.
- *
- * Input: None
- *
- * Output: None
- *
- * Return: None
- *
- * Special Note: None
- *
- * Error: None
- *
- ***************************************************************************************
- */
- void fdct_WMT(short *InputData, short *OutputData)
- {
- __asm
- {
- mov eax, InputData
- mov ebx, OutputData
- lea edx, WmtIdctConst
- #define I(i) [eax + 16 * i ]
- #define O(i) [ebx + 16 * i ]
- #define C(i) [edx + 16 * i ]
- /******************************************************/
- /* Do 8x8 Transpose */
- /******************************************************/
- movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */
- movdqa xmm0, I(5) /* xmm4=f7f6f5f4f3f2f1f0 */
-
- psllw xmm4, 1
- psllw xmm0, 1
- movdqa xmm5, xmm4 /* make a copy */
- punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */
-
- punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */
- movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */
-
- movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */
- psllw xmm6, 1
- psllw xmm0, 1
- movdqa xmm7, xmm6 /* make a copy */
-
- punpcklwd xmm6, xmm0 /* xmm6=h3g3h3g2h1g1h0g0 */
- punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */
-
- movdqa xmm3, xmm4 /* make a copy */
- punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */
-
- punpckhdq xmm3, xmm6 /* xmm3=h3g3g3e3h2g2f2e2 */
- movdqa I(6), xmm3 /* save h3g3g3e3h2g2f2e2 */
- /* Free xmm6 */
- movdqa xmm6, xmm5 /* make a copy */
- punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */
-
- punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */
- movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */
- /* Free xmm7 */
- movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */
- psllw xmm0, 1
- psllw xmm1, 1
-
- movdqa xmm7, xmm0 /* make a copy */
-
- punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */
- punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */
- /* Free xmm1 */
- movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */
- movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */
-
- psllw xmm2, 1
- psllw xmm3, 1
- movdqa xmm1, xmm2 /* make a copy */
- punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */
-
- punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */
- movdqa xmm3, xmm0 /* make a copy */
-
- punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */
- punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */
- /* Free xmm2 */
- movdqa xmm2, xmm7 /* make a copy */
- punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */
-
- punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */
- movdqa xmm1, xmm0 /* make a copy */
-
- punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */
- punpckhqdq xmm1, xmm4 /* xmm1=h1g1g1e1d1c1b1a1 */
-
- movdqa I(0), xmm0 /* save I(0) */
- movdqa I(1), xmm1 /* save I(1) */
-
- movdqa xmm0, I(6) /* load h3g3g3e3h2g2f2e2 */
- movdqa xmm1, xmm3 /* make a copy */
-
- punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */
- punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */
-
- movdqa xmm4, xmm2 /* make a copy */
- punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */
-
- punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */
- movdqa I(2), xmm1 /* save I(2) */
-
- movdqa I(3), xmm3 /* save I(3) */
- movdqa I(4), xmm4 /* save I(4) */
-
- movdqa I(5), xmm2 /* save I(5) */
- movdqa xmm5, xmm7 /* make a copy */
-
- punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */
- punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */
-
- movdqa I(6), xmm5 /* save I(6) */
- movdqa I(7), xmm7 /* save I(7) */
- /******************************************************/
- /* Done with transpose - Let's do the forward DCT */
- /******************************************************/
- movdqa xmm0, I(0) /* xmm0 = ip0 */
- movdqa xmm1, I(1) /* xmm1 = ip1 */
- movdqa xmm2, I(3) /* xmm2 = ip3 */
- movdqa xmm3, I(5) /* xmm3 = ip5 */
- movdqa xmm4, xmm0 /* xmm4 = ip0 */
- movdqa xmm5, xmm1 /* xmm5 = ip1 */
-
- movdqa xmm6, xmm2 /* xmm6 = ip3 */
- movdqa xmm7, xmm3 /* xmm7 = ip5 */
- paddsw xmm0, I(7) /* xmm0 = ip0 + ip7 */
- paddsw xmm1, I(2) /* xmm1 = ip1 + ip2 */
- paddsw xmm2, I(4) /* xmm2 = ip3 + ip4 */
- paddsw xmm3, I(6) /* xmm3 = ip5 + ip6 */
- psubsw xmm4, I(7) /* xmm4 = ip0 - ip7 */
- psubsw xmm5, I(2) /* xmm5 = ip1 - ip2 */
- psubsw xmm0, xmm2 /* xmm0 = is07 - is34 */
- paddsw xmm2, xmm2 /* xmm2 = is34 * 2 */
-
- psubsw xmm6, I(4) /* xmm6 = ip3 - ip4 */
- paddsw xmm2, xmm0 /* xmm2 = is07 + is34 */
- psubsw xmm1, xmm3 /* xmm1 = is12 - is56 */
- movdqa TIRY, xmm0 /* save is07-is34 */
- paddsw xmm3, xmm3 /* xmm3 = is56 * 2 */
- paddsw xmm3, xmm1 /* xmm3 = is12 + is56 */
-
- psubsw xmm7, I(6) /* xmm7 = ip5 -ip6 */
- psubsw xmm5, xmm7 /* xmm5 = id12 - id56 */
-
- paddsw xmm7, xmm7 /* xmm7 = id56 * 2 */
- paddsw xmm7, xmm5 /* xmm7 = id12 + id56 */
- /*---------------------------------------------------------*/
- /* op0 and op4
- /*---------------------------------------------------------*/
- psubsw xmm2, xmm3 /* xmm2 = is0734 - is1256 */
- paddsw xmm3, xmm3 /* xmm3 = is1256 * 2 */
- movdqa xmm0, xmm2 /* xmm0 = is0734 - is1256 */
- paddsw xmm3, xmm2 /* xmm3 = is0734 + is1256 */
- pmulhw xmm0, C(4) /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
- paddw xmm0, xmm2 /* xmm0 = xC4S4 * ( is0734 - is1256 ) */
- psrlw xmm2, 15
- paddw xmm0, xmm2 /* Truncate xmm0, now it is op[4] */
-
- movdqa xmm2, xmm3 /* xmm2 = is0734 + is1256 */
- movdqa O(4), xmm0 /* op4, now xmm0,xmm2 are free */
-
- movdqa xmm0, xmm3 /* xmm0 = is0734 + is1256 */
- pmulhw xmm3, C(4) /* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
-
- psrlw xmm2, 15
- paddw xmm3, xmm0 /* xmm3 = xC4S4 * ( is0734 +is1256 ) */
-
- paddw xmm3, xmm2 /* Truncate xmm3, now it is op[0] */
- movdqa O(0), xmm3 /* save op0 */
- /*---------------------------------------------------------*/
- /* op2 and op6
- /*---------------------------------------------------------*/
- movdqa xmm3, TIRY /* xmm3 = irot_input_y */
- pmulhw xmm3, C(2) /* xmm3 = xC2S6 * irot_input_y - irot_input_y */
-
- movdqa xmm2, TIRY /* xmm2 = irot_input_y */
- movdqa xmm0, xmm2 /* xmm0 = irot_input_y */
-
- psrlw xmm2, 15
- paddw xmm3, xmm0 /* xmm3 = xC2S6 * irot_input_y */
-
- paddw xmm3, xmm2 /* Truncated */
- movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
-
-
- movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
- pmulhw xmm0, C(6) /* xmm0 = xC6S2 * irot_input_x */
-
- psrlw xmm2, 15
- paddw xmm0, xmm2 /* Truncated */
-
- paddsw xmm3, xmm0 /* op[2] */
- movdqa O(2), xmm3 /* save op[2] */
-
-
- movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
- movdqa xmm2, xmm5 /* xmm0 = id12 - id56 */
-
- pmulhw xmm5, C(2) /* xmm5 = xC2S6 * irot_input_x - irot_input_x */
- psrlw xmm2, 15
-
- movdqa xmm3, TIRY /* xmm3 = irot_input_y */
- paddw xmm5, xmm0 /* xmm5 = xC2S6 * irot_input_x */
-
- paddw xmm5, xmm2 /* Truncated */
- movdqa xmm2, xmm3 /* xmm2 = irot_input_y */
-
- pmulhw xmm3, C(6) /* mm3 = xC6S2 * irot_input_y */
- psrlw xmm2, 15
-
- paddw xmm3, xmm2 /* Truncated */
- psubsw xmm3, xmm5 /* xmm3 = op[6] */
-
- movdqa O(6), xmm3
- /*-----------------------------------------------------------------------*/
- /* icommon_product1, icommon_product2 */
- /*-----------------------------------------------------------------------*/
- movdqa xmm0, C(4) /* xmm0 = xC4s4 */
- movdqa xmm2, xmm1 /* xmm2 = is12 - is56 */
-
- movdqa xmm3, xmm1 /* xmm3 = is12 - is56 */
- pmulhw xmm1, xmm0 /* xmm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
-
- psrlw xmm2, 15
- paddw xmm1, xmm3 /* xmm1 = xC4S4 * ( is12 - is56 ) */
-
- paddw xmm1, xmm2 /* Truncate xmm1, now it is icommon_product1 */
- movdqa xmm2, xmm7 /* xmm2 = id12 + id56 */
-
- movdqa xmm3, xmm7 /* xmm3 = id12 + id56 */
- pmulhw xmm7, xmm0 /* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
-
- psrlw xmm2, 15 /* For trucation */
- paddw xmm7, xmm3 /* xmm7 = xC4S4 * ( id12 + id56 ) */
- paddw xmm7, xmm2 /* Truncate xmm7, now it is icommon_product2 */
- /*---------------------------------------------------------*/
- pxor xmm0, xmm0 /* Clear xmm0 */
- psubsw xmm0, xmm6 /* xmm0 = - id34 */
- psubsw xmm0, xmm7 /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
- paddsw xmm6, xmm6 /* xmm6 = id34 * 2 */
- paddsw xmm6, xmm0 /* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
- psubsw xmm4, xmm1 /* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
- paddsw xmm1, xmm1 /* xmm1 = icommon_product1 * 2 */
- paddsw xmm1, xmm4 /* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
- /*---------------------------------------------------------*/
- /* op1 and op7
- /*---------------------------------------------------------*/
- movdqa xmm7, C(1) /* xC1S7 */
- movdqa xmm2, xmm1 /* xmm2 = irot_input_x */
-
- movdqa xmm3, xmm1; /* xmm3 = irot_input_x */
- pmulhw xmm1, xmm7 /* xmm1 = xC1S7 * irot_input_x - irot_input_x */
-
- movdqa xmm7, C(7) /* xC7S1 */
- psrlw xmm2, 15 /* for trucation */
-
- paddw xmm1, xmm3 /* xmm1 = xC1S7 * irot_input_x */
- paddw xmm1, xmm2 /* Trucated */
-
- pmulhw xmm3, xmm7 /* xmm3 = xC7S1 * irot_input_x */
- paddw xmm3, xmm2 /* Truncated */
-
- movdqa xmm5, xmm0 /* xmm5 = irot_input_y */
- movdqa xmm2, xmm0 /* xmm2 = irot_input_y */
-
- movdqa xmm7, C(1) /* xC1S7 */
- pmulhw xmm0, xmm7 /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
-
- movdqa xmm7, C(7) /* xC7S1 */
- psrlw xmm2, 15 /* for trucation */
-
- paddw xmm0, xmm5 /* xmm0 = xC1S7 * irot_input_y */
- paddw xmm0, xmm2 /* Truncated */
-
- pmulhw xmm5, xmm7 /* xmm5 = xC7S1 * irot_input_y */
- paddw xmm5, xmm2 /* Truncated */
-
- psubsw xmm1, xmm5 /* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
- paddsw xmm3, xmm0 /* xmm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
-
- movdqa O(1), xmm1
- movdqa O(7), xmm3
- /*---------------------------------------------------------*/
- /* op3 and op5
- /*---------------------------------------------------------*/
- movdqa xmm0, C(3) /* xC3S5 */
- movdqa xmm1, C(5) /* xC5S3 */
- movdqa xmm5,xmm6 /* irot_input_x */
- movdqa xmm7,xmm6 /* irot_input_x */
- movdqa xmm2,xmm4 /* irot_input_y */
- movdqa xmm3,xmm4 /* irot_input_y */
- pmulhw xmm4,xmm0 /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
- pmulhw xmm6,xmm1 /* xmm6 = xC5S3 * irot_input_y - irot_input_y */
- psrlw xmm2,15 /* for trucation */
- psrlw xmm5,15 /* for trucation */
- paddw xmm4,xmm3 /* xmm4 = xC3S5 * irot_input_x */
- paddw xmm6,xmm7 /* xmm6 = xC5S3 * irot_input_y */
- paddw xmm4,xmm2 /* Truncated */
- paddw xmm6,xmm5 /* Truncated */
- psubsw xmm4,xmm6 /* op [3] */
- movdqa O(3),xmm4 /* Save Op[3] */
- movdqa xmm4,xmm3 /* irot_input_y */
- movdqa xmm6,xmm7 /* irot_input_x */
- pmulhw xmm3,xmm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
- pmulhw xmm7,xmm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
- paddw xmm4,xmm2 /* Trucated */
- paddw xmm6,xmm5 /* Trucated */
- paddw xmm3,xmm4 /* xmm3 = xC5S3 * irot_input_x */
- paddw xmm7,xmm6 /* mm7 = xC3S5 * irot_input_y */
- paddw xmm3,xmm7 /* Op[5] */
- movdqa O(5),xmm3 /* Save Op[5] */
- /*---------------------------------------------------------*/
- /* End of 8 1-D FDCT */
- /*---------------------------------------------------------*/
- #undef I
- #undef O
- #define I(i) [ebx + 16 * i ]
- #define O(i) [ebx + 16 * i ]
- /******************************************************/
- /* Do 8x8 Transpose */
- /******************************************************/
- movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */
- movdqa xmm0, I(5) /* xmm4=f7f6f5f4f3f2f1f0 */
-
- movdqa xmm5, xmm4 /* make a copy */
- punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */
-
- punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */
- movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */
-
- movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */
- movdqa xmm7, xmm6 /* make a copy */
-
- punpcklwd xmm6, xmm0 /* xmm6=h3g3h3g2h1g1h0g0 */
- punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */
-
- movdqa xmm3, xmm4 /* make a copy */
- punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */
-
- punpckhdq xmm3, xmm6 /* xmm3=h3g3g3e3h2g2f2e2 */
- movdqa I(6), xmm3 /* save h3g3g3e3h2g2f2e2 */
- /* Free xmm6 */
- movdqa xmm6, xmm5 /* make a copy */
- punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */
-
- punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */
- movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */
- /* Free xmm7 */
- movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */
- movdqa xmm7, xmm0 /* make a copy */
-
- punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */
- punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */
- /* Free xmm1 */
- movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */
- movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */
-
- movdqa xmm1, xmm2 /* make a copy */
- punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */
-
- punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */
- movdqa xmm3, xmm0 /* make a copy */
-
- punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */
- punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */
- /* Free xmm2 */
- movdqa xmm2, xmm7 /* make a copy */
- punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */
-
- punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */
- movdqa xmm1, xmm0 /* make a copy */
-
- punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */
- punpckhqdq xmm1, xmm4 /* xmm1=h1g1g1e1d1c1b1a1 */
-
- movdqa I(0), xmm0 /* save I(0) */
- movdqa I(1), xmm1 /* save I(1) */
-
- movdqa xmm0, I(6) /* load h3g3g3e3h2g2f2e2 */
- movdqa xmm1, xmm3 /* make a copy */
-
- punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */
- punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */
-
- movdqa xmm4, xmm2 /* make a copy */
- punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */
-
- punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */
- movdqa I(2), xmm1 /* save I(2) */
-
- movdqa I(3), xmm3 /* save I(3) */
- movdqa I(4), xmm4 /* save I(4) */
-
- movdqa I(5), xmm2 /* save I(5) */
- movdqa xmm5, xmm7 /* make a copy */
-
- punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */
- punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */
-
- movdqa I(6), xmm5 /* save I(6) */
- movdqa I(7), xmm7 /* save I(7) */
- /******************************************************/
- /* Done with transpose - Let's do the forward DCT */
- /******************************************************/
- movdqa xmm0, I(0) /* xmm0 = ip0 */
- movdqa xmm1, I(1) /* xmm1 = ip1 */
- movdqa xmm2, I(3) /* xmm2 = ip3 */
- movdqa xmm3, I(5) /* xmm3 = ip5 */
- movdqa xmm4, xmm0 /* xmm4 = ip0 */
- movdqa xmm5, xmm1 /* xmm5 = ip1 */
-
- movdqa xmm6, xmm2 /* xmm6 = ip3 */
- movdqa xmm7, xmm3 /* xmm7 = ip5 */
- paddsw xmm0, I(7) /* xmm0 = ip0 + ip7 */
- paddsw xmm1, I(2) /* xmm1 = ip1 + ip2 */
- paddsw xmm2, I(4) /* xmm2 = ip3 + ip4 */
- paddsw xmm3, I(6) /* xmm3 = ip5 + ip6 */
- psubsw xmm4, I(7) /* xmm4 = ip0 - ip7 */
- psubsw xmm5, I(2) /* xmm5 = ip1 - ip2 */
- psubsw xmm0, xmm2 /* xmm0 = is07 - is34 */
- paddsw xmm2, xmm2 /* xmm2 = is34 * 2 */
-
- psubsw xmm6, I(4) /* xmm6 = ip3 - ip4 */
- paddsw xmm2, xmm0 /* xmm2 = is07 + is34 */
- psubsw xmm1, xmm3 /* xmm1 = is12 - is56 */
- movdqa TIRY, xmm0 /* save is07-is34 */
- paddsw xmm3, xmm3 /* xmm3 = is56 * 2 */
- paddsw xmm3, xmm1 /* xmm3 = is12 + is56 */
-
- psubsw xmm7, I(6) /* xmm7 = ip5 -ip6 */
- psubsw xmm5, xmm7 /* xmm5 = id12 - id56 */
-
- paddsw xmm7, xmm7 /* xmm7 = id56 * 2 */
- paddsw xmm7, xmm5 /* xmm7 = id12 + id56 */
- /*---------------------------------------------------------*/
- /* op0 and op4
- /*---------------------------------------------------------*/
- #if 0
- movdqa xmm0, xmm2 /* xmm0 =xmm2= is0734 */
- pmulhw xmm2, C(4) /* xC4S4 * is0734 - is0734 */
-
- paddw xmm2, xmm0 /* XC4S4 * is0734 */
- movdqa xmm0, xmm3 /* xmm0 =xmm3= is1256 */
- pmulhw xmm3, C(4) /* xC4S4 * is1256 - is1256 */
- paddw xmm3, xmm0 /* xC4S4 * is1256 */
- movdqa xmm0, xmm2
- paddsw xmm2, xmm3 /* xC4S4 * ( is0734 +is1256 ) */
- psubsw xmm0, xmm3 /* xC4S4 * ( is0734 -is1256 ) */
- movdqa xmm3, xmm2
-
- psrlw xmm2, 15
- paddsw xmm3, xmm2
- movdqa xmm2, xmm0
- movdqa O(0), xmm3
-
- psrlw xmm0, 15
- paddsw xmm2, xmm0
- movdqa O(4), xmm2
- #else
- psubsw xmm2, xmm3 /* xmm2 = is0734 - is1256 */
- paddsw xmm3, xmm3 /* xmm3 = is1256 * 2 */
- movdqa xmm0, xmm2 /* xmm0 = is0734 - is1256 */
- paddsw xmm3, xmm2 /* xmm3 = is0734 + is1256 */
- pmulhw xmm0, C(4) /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
- paddw xmm0, xmm2 /* xmm0 = xC4S4 * ( is0734 - is1256 ) */
- psrlw xmm2, 15
- paddw xmm0, xmm2 /* Truncate xmm0, now it is op[4] */
-
- movdqa xmm2, xmm0
- psrlw xmm0, 15
-
- paddw xmm0, xmm2
- psraw xmm0, 1
-
- movdqa O(4), xmm0 /* op4, now xmm0,xmm2 are free */
- movdqa xmm2, xmm3 /* xmm2 = is0734 + is1256 */
-
-
- movdqa xmm0, xmm3 /* xmm0 = is0734 + is1256 */
- pmulhw xmm3, C(4) /* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
-
- psrlw xmm2, 15
- paddw xmm3, xmm0 /* xmm3 = xC4S4 * ( is0734 +is1256 ) */
-
- paddw xmm3, xmm2 /* Truncate xmm3, now it is op[0] */
- movdqa xmm2, xmm3
- psrlw xmm3, 15
- paddw xmm3, xmm2
-
- psraw xmm3, 1
- movdqa O(0), xmm3 /* save op0 */
- #endif
- /*---------------------------------------------------------*/
- /* op2 and op6
- /*---------------------------------------------------------*/
- movdqa xmm3, TIRY /* xmm3 = irot_input_y */
- pmulhw xmm3, C(2) /* xmm3 = xC2S6 * irot_input_y - irot_input_y */
-
- movdqa xmm2, TIRY /* xmm2 = irot_input_y */
- movdqa xmm0, xmm2 /* xmm0 = irot_input_y */
-
- psrlw xmm2, 15
- paddw xmm3, xmm0 /* xmm3 = xC2S6 * irot_input_y */
-
- paddw xmm3, xmm2 /* Truncated */
- movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
-
-
- movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
- pmulhw xmm0, C(6) /* xmm0 = xC6S2 * irot_input_x */
-
- psrlw xmm2, 15
- paddw xmm0, xmm2 /* Truncated */
-
- paddsw xmm3, xmm0 /* op[2] */
- movdqa xmm0, xmm3
- psrlw xmm3, 15
- paddw xmm3, xmm0
- psraw xmm3, 1
- movdqa O(2), xmm3 /* save op[2] */
-
-
- movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
- movdqa xmm2, xmm5 /* xmm0 = id12 - id56 */
-
- pmulhw xmm5, C(2) /* xmm5 = xC2S6 * irot_input_x - irot_input_x */
- psrlw xmm2, 15
-
- movdqa xmm3, TIRY /* xmm3 = irot_input_y */
- paddw xmm5, xmm0 /* xmm5 = xC2S6 * irot_input_x */
-
- paddw xmm5, xmm2 /* Truncated */
- movdqa xmm2, xmm3 /* xmm2 = irot_input_y */
-
- pmulhw xmm3, C(6) /* mm3 = xC6S2 * irot_input_y */
- psrlw xmm2, 15
-
- paddw xmm3, xmm2 /* Truncated */
- psubsw xmm3, xmm5 /* xmm3 = op[6] */
-
- movdqa xmm5, xmm3
- psrlw xmm3, 15
-
- paddw xmm3, xmm5
- psraw xmm3, 1
-
- movdqa O(6), xmm3
- /*-----------------------------------------------------------------------*/
- /* icommon_product1, icommon_product2 */
- /*-----------------------------------------------------------------------*/
- movdqa xmm0, C(4) /* xmm0 = xC4s4 */
- movdqa xmm2, xmm1 /* xmm2 = is12 - is56 */
-
- movdqa xmm3, xmm1 /* xmm3 = is12 - is56 */
- pmulhw xmm1, xmm0 /* xmm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
-
- psrlw xmm2, 15
- paddw xmm1, xmm3 /* xmm1 = xC4S4 * ( is12 - is56 ) */
-
- paddw xmm1, xmm2 /* Truncate xmm1, now it is icommon_product1 */
- movdqa xmm2, xmm7 /* xmm2 = id12 + id56 */
-
- movdqa xmm3, xmm7 /* xmm3 = id12 + id56 */
- pmulhw xmm7, xmm0 /* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
-
- psrlw xmm2, 15 /* For trucation */
- paddw xmm7, xmm3 /* xmm7 = xC4S4 * ( id12 + id56 ) */
- paddw xmm7, xmm2 /* Truncate xmm7, now it is icommon_product2 */
- /*---------------------------------------------------------*/
- pxor xmm0, xmm0 /* Clear xmm0 */
- psubsw xmm0, xmm6 /* xmm0 = - id34 */
- psubsw xmm0, xmm7 /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
- paddsw xmm6, xmm6 /* xmm6 = id34 * 2 */
- paddsw xmm6, xmm0 /* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
- psubsw xmm4, xmm1 /* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
- paddsw xmm1, xmm1 /* xmm1 = icommon_product1 * 2 */
- paddsw xmm1, xmm4 /* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
- /*---------------------------------------------------------*/
- /* op1 and op7
- /*---------------------------------------------------------*/
- movdqa xmm7, C(1) /* xC1S7 */
- movdqa xmm2, xmm1 /* xmm2 = irot_input_x */
-
- movdqa xmm3, xmm1; /* xmm3 = irot_input_x */
- pmulhw xmm1, xmm7 /* xmm1 = xC1S7 * irot_input_x - irot_input_x */
-
- movdqa xmm7, C(7) /* xC7S1 */
- psrlw xmm2, 15 /* for trucation */
-
- paddw xmm1, xmm3 /* xmm1 = xC1S7 * irot_input_x */
- paddw xmm1, xmm2 /* Trucated */
-
- pmulhw xmm3, xmm7 /* xmm3 = xC7S1 * irot_input_x */
- paddw xmm3, xmm2 /* Truncated */
-
- movdqa xmm5, xmm0 /* xmm5 = irot_input_y */
- movdqa xmm2, xmm0 /* xmm2 = irot_input_y */
-
- movdqa xmm7, C(1) /* xC1S7 */
- pmulhw xmm0, xmm7 /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
-
- movdqa xmm7, C(7) /* xC7S1 */
- psrlw xmm2, 15 /* for trucation */
-
- paddw xmm0, xmm5 /* xmm0 = xC1S7 * irot_input_y */
- paddw xmm0, xmm2 /* Truncated */
-
- pmulhw xmm5, xmm7 /* xmm5 = xC7S1 * irot_input_y */
- paddw xmm5, xmm2 /* Truncated */
-
- psubsw xmm1, xmm5 /* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
- paddsw xmm3, xmm0 /* xmm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
- movdqa xmm5, xmm1
- movdqa xmm0, xmm3
- psrlw xmm1, 15
- psrlw xmm3, 15
- paddw xmm1, xmm5
- paddw xmm3, xmm0
- psraw xmm1, 1
- psraw xmm3, 1
-
- movdqa O(1), xmm1
- movdqa O(7), xmm3
- /*---------------------------------------------------------*/
- /* op3 and op5
- /*---------------------------------------------------------*/
- movdqa xmm0, C(3) /* xC3S5 */
- movdqa xmm1, C(5) /* xC5S3 */
- movdqa xmm5,xmm6 /* irot_input_x */
- movdqa xmm7,xmm6 /* irot_input_x */
- movdqa xmm2,xmm4 /* irot_input_y */
- movdqa xmm3,xmm4 /* irot_input_y */
- pmulhw xmm4,xmm0 /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
- pmulhw xmm6,xmm1 /* xmm6 = xC5S3 * irot_input_y - irot_input_y */
- psrlw xmm2,15 /* for trucation */
- psrlw xmm5,15 /* for trucation */
- paddw xmm4,xmm3 /* xmm4 = xC3S5 * irot_input_x */
- paddw xmm6,xmm7 /* xmm6 = xC5S3 * irot_input_y */
- paddw xmm4,xmm2 /* Truncated */
- paddw xmm6,xmm5 /* Truncated */
- psubsw xmm4,xmm6 /* op [3] */
- movdqa xmm6,xmm4
- psrlw xmm4,15
- paddw xmm4,xmm6
- psraw xmm4,1
- movdqa O(3),xmm4 /* Save Op[3] */
- movdqa xmm4,xmm3 /* irot_input_y */
- movdqa xmm6,xmm7 /* irot_input_x */
- pmulhw xmm3,xmm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
- pmulhw xmm7,xmm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
- paddw xmm4,xmm2 /* Trucated */
- paddw xmm6,xmm5 /* Trucated */
- paddw xmm3,xmm4 /* xmm3 = xC5S3 * irot_input_x */
- paddw xmm7,xmm6 /* mm7 = xC3S5 * irot_input_y */
- paddw xmm3,xmm7 /* Op[5] */
- movdqa xmm7,xmm3
- psrlw xmm3,15
- paddw xmm3,xmm7
- psraw xmm3,1
- movdqa O(5),xmm3 /* Save Op[5] */
- /*---------------------------------------------------------*/
- /* End of 8 1-D FDCT */
- /*---------------------------------------------------------*/
- }/* end of _asm code section */
- }
|