12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398 |
- /****************************************************************************
- *
- * Module Title : fdctmmx.c
- *
- * Description : Forward DCT optimized specifically for mmx or compatible
- * processor
- *
- * AUTHOR : Yaowu Xu
- *
- *****************************************************************************
- * Revision History
- *
- * 1.00 YWX 07/11/11 Configuration baseline
- *
- *****************************************************************************
- */
- /*******************************************************************************
- * Module Constants
- *******************************************************************************
- */
-
- __declspec(align(16)) static unsigned short TIRY[8]; /* file-scope scratch: fdct_MMX spills "is07 - is34" here with movq and reloads it as irot_input_y; NOTE(review): shared static, so concurrent calls would race on it */
- __declspec(align(16)) static unsigned short MmxIdctConst[8 * 4] = /* 8 rows of 4 identical words; row i is read as one quadword via C(i) = [edx + 8 * i]; used by the forward DCT despite the "Idct" name */
- {
- 0, 0, 0, 0, /* C(0): not referenced in the visible part of fdct_MMX */
- 64277,64277,64277,64277, /* C(1), "xC1S7": matches round(65536 * cos(1*pi/16)) */
- 60547,60547,60547,60547, /* C(2), "xC2S6": matches round(65536 * cos(2*pi/16)) */
- 54491,54491,54491,54491, /* C(3), "xC3S5": matches round(65536 * cos(3*pi/16)) */
- 46341,46341,46341,46341, /* C(4), "xC4S4": matches round(65536 * cos(4*pi/16)) */
- 36410,36410,36410,36410, /* C(5), "xC5S3": matches round(65536 * cos(5*pi/16)) */
- 25080,25080,25080,25080, /* C(6), "xC6S2": matches round(65536 * cos(6*pi/16)) */
- 12785,12785,12785,12785 /* C(7), "xC7S1": matches round(65536 * cos(7*pi/16)) */
- }; /* rows above 32767 (C(1)..C(5)) are negative to pmulhw, so the code adds the multiplicand back after each such multiply */
-
- /**************************************************************************************
- *
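- * Function: fdct_MMX
- *
- * Description: Forward DCT of the block at InputData using MMX instructions.
- *
- * Input: InputData - source values (short); also overwritten as scratch
- *
- * Output: OutputData - receives the transformed coefficients (short)
- *
- * Return: None
- *
- * Special Note: The input data is limited to 9 bits [-256, 255]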
- *
- * Error: None
- *
- ***************************************************************************************
- */
- void fdct_MMX(short *InputData, short *OutputData)
- {
- __asm
- {
- mov eax, InputData
- mov ebx, OutputData
- lea ecx, [eax+8]
- lea edi, [ebx+8]
- lea edx, MmxIdctConst
- #define IL(i) [eax + 16 * i]
- #define IH(i) [ecx + 16 * i]
- #define OL(i) [ebx + 16 * i]
- #define OH(i) [edi + 16 * i]
- #define C(i) [edx + 8 * i]
- /******************************************************/
- /* Do 4x8 Transpose is done through 2 4x4 Transpose */
- /******************************************************/
- movq mm4, IH(0) /* mm4=e3e2e1e0 */
- movq mm0, IH(1) /* mm4=f3f2f1f0 */
-
- psllw mm4, 1 /* up precision */
- psllw mm0, 1 /* up precision */
- movq mm5, mm4 /* make a copy */
- punpcklwd mm4, mm0 /* mm4=f1e1f0e0 */
-
- punpckhwd mm5, mm0 /* mm5=f3e3f2e2 */
- movq mm6, IH(2) /* mm6=g3g2g1g0 */
- movq mm0, IH(3) /* mm0=h3h2h1h0 */
- psllw mm6, 1 /* up precision */
- psllw mm0, 1 /* up precision */
- movq mm7, mm6 /* mm7=g3g2g1g0 */
-
- punpcklwd mm6, mm0 /* mm6=h1g1h0g0 */
- punpckhwd mm7, mm0 /* mm7=h3g3h2g2 */
-
- movq mm3, mm4 /* mm4=f1e1f0e0 */
- punpckldq mm4, mm6 /* mm4=h0g0f0e0 */
-
- punpckhdq mm3, mm6 /* mm3=h1g1f1e1 */
- movq mm6, mm5 /* mm5=f3e3f2e2 */
- punpckldq mm5, mm7 /* mm5=h2g2f2e2 */
- movq IH(0), mm4 /* saveh0g0f0e0 */
-
- punpckhdq mm6, mm7 /* mm6=h3g3f3e3 */
- movq IH(2), mm5 /* saveh2g2f2e2 */
- movq IH(3), mm6 /* saveh3g3f3e3 */
- /*----------------------------------------------------*/
- /* mm3 in use for IH(1) */
- /*----------------------------------------------------*/
- movq mm4, IL(0) /* mm4=a3a2a1a0 */
- movq mm0, IL(1) /* mm0=b3b2b1b0 */
- psllw mm4, 1 /* up precision */
- psllw mm0, 1 /* up precision */
-
- movq mm5, mm4 /* mm5=a3a2a1a0 */
- punpcklwd mm4, mm0 /* mm4=b1a1b0a0 */
-
- punpckhwd mm5, mm0 /* mm5=b3a3b2a2 */
- movq mm6, IL(2) /* mm6=c3c2c1c0 */
-
-
- movq mm0, IL(3) /* mm0=d3d2d1d0 */
- psllw mm6, 1 /* up precision */
- psllw mm0, 1 /* up precision */
- movq mm7, mm6 /* mm7=c3c2c1c0 */
- punpcklwd mm6, mm0 /* mm6=d1c1d0c0 */
- punpckhwd mm7, mm0 /* mm7=c3c3d2c2 */
-
- movq mm1, mm4 /* mm4=b1a1b0a0 */
- punpckldq mm4, mm6 /* mm4=d0c0b0a0 */
-
- punpckhdq mm1, mm6 /* mm1=d1c1b1a1 */
- movq mm2, mm5 /* mm5=b3a3b2a2 */
- punpckldq mm5, mm7 /* mm5=d2c2b2a2 */
- punpckhdq mm2, mm7 /* mm6=d3c3b3a3 */
-
- movq IL(2), mm5 /* saved2c2b2a2 */
- /*----------------------------------------------------*/
- /* mm1 in use for IL(1) */
- /* mm2 in use for IL(3) */
- /* mm3 in use for IH(1) */
- /* mm4 in use for IH(0) */
- /*----------------------------------------------------*/
- /******************************************************/
- /* Let's do the 4x8 forward DCT */
- /******************************************************/
- movq mm0, mm4 /* mm4 = ip0 */
- movq mm5, mm1 /* mm5 = ip1 */
-
- movq mm6, mm2 /* mm6 = ip3 */
- movq mm7, mm3 /* mm7 = ip5 */
- paddsw mm0, IH(3) /* mm0 = ip0 + ip7 */
- paddsw mm1, IL(2) /* mm1 = ip1 + ip2 */
- paddsw mm2, IH(0) /* mm2 = ip3 + ip4 */
- paddsw mm3, IH(2) /* mm3 = ip5 + ip6 */
- psubsw mm4, IH(3) /* mm4 = ip0 - ip7 */
- psubsw mm5, IL(2) /* mm5 = ip1 - ip2 */
- psubsw mm0, mm2 /* mm0 = is07 - is34 */
- paddsw mm2, mm2 /* mm2 = is34 * 2 */
-
- psubsw mm6, IH(0) /* mm6 = ip3 - ip4 */
- paddsw mm2, mm0 /* mm2 = is07 + is34 */
- psubsw mm1, mm3 /* mm1 = is12 - is56 */
- movq TIRY, mm0 /* save is07-is34 */
- paddsw mm3, mm3 /* mm3 = is56 * 2 */
- paddsw mm3, mm1 /* mm3 = is12 + is56 */
-
- psubsw mm7, IH(2) /* mm7 = ip5 -ip6 */
- psubsw mm5, mm7 /* mm5 = id12 - id56 */
-
- paddsw mm7, mm7 /* mm7 = id56 * 2 */
- paddsw mm7, mm5 /* mm7 = id12 + id56 */
- /*---------------------------------------------------------*/
- /* op0 and op4 */
- /*---------------------------------------------------------*/
- psubsw mm2, mm3 /* mm2 = is0734 - is1256 */
- paddsw mm3, mm3 /* mm3 = is1256 * 2 */
- movq mm0, mm2 /* mm0 = is0734 - is1256 */
- paddsw mm3, mm2 /* mm3 = is0734 + is1256 */
- pmulhw mm0, C(4) /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
- paddw mm0, mm2 /* mm0 = xC4S4 * ( is0734 - is1256 ) */
- psrlw mm2, 15
- paddw mm0, mm2 /* Truncate mm0, now it is op[4] */
-
- movq mm2, mm3 /* mm2 = is0734 + is1256 */
- movq OH(0), mm0 /* op4, now mm0,mm2 are free */
-
- movq mm0, mm3 /* mm0 = is0734 + is1256 */
- pmulhw mm3, C(4) /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
-
- psrlw mm2, 15
- paddw mm3, mm0 /* mm3 = xC4S4 * ( is0734 +is1256 ) */
-
- paddw mm3, mm2 /* Truncate mm3, now it is op[0] */
- movq OL(0), mm3 /* save op0 */
- /*---------------------------------------------------------*/
- /* op2 and op6 */
- /*---------------------------------------------------------*/
- movq mm3, TIRY /* mm3 = irot_input_y */
- pmulhw mm3, C(2) /* mm3 = xC2S6 * irot_input_y - irot_input_y */
-
- movq mm2, TIRY /* mm2 = irot_input_y */
- movq mm0, mm2 /* mm0 = irot_input_y */
-
- psrlw mm2, 15
- paddw mm3, mm0 /* mm3 = xC2S6 * irot_input_y */
-
- paddw mm3, mm2 /* Truncated */
- movq mm0, mm5 /* mm0 = id12 - id56 */
-
-
- movq mm2, mm5 /* mm2 = id12 - id56 */
- pmulhw mm0, C(6) /* mm0 = xC6S2 * irot_input_x */
-
- psrlw mm2, 15
- paddw mm0, mm2 /* Truncated */
-
- paddsw mm3, mm0 /* op[2] */
- movq OL(2), mm3 /* save op[2] */
-
-
- movq mm0, mm5 /* mm0 = id12 - id56 */
- movq mm2, mm5 /* mm0 = id12 - id56 */
-
- pmulhw mm5, C(2) /* mm5 = xC2S6 * irot_input_x - irot_input_x */
- psrlw mm2, 15
-
- movq mm3, TIRY /* mm3 = irot_input_y */
- paddw mm5, mm0 /* mm5 = xC2S6 * irot_input_x */
-
- paddw mm5, mm2 /* Truncated */
- movq mm2, mm3 /* mm2 = irot_input_y */
-
- pmulhw mm3, C(6) /* mm3 = xC6S2 * irot_input_y */
- psrlw mm2, 15
-
- paddw mm3, mm2 /* Truncated */
- psubsw mm3, mm5 /* mm3 = op[6] */
-
- movq OH(2), mm3
- /*-----------------------------------------------------------------------*/
- /* icommon_product1, icommon_product2 */
- /*-----------------------------------------------------------------------*/
- movq mm0, C(4) /* mm0 = xC4s4 */
- movq mm2, mm1 /* mm2 = is12 - is56 */
-
- movq mm3, mm1 /* mm3 = is12 - is56 */
- pmulhw mm1, mm0 /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
-
- psrlw mm2, 15
- paddw mm1, mm3 /* mm1 = xC4S4 * ( is12 - is56 ) */
-
- paddw mm1, mm2 /* Truncate mm1, now it is icommon_product1 */
- movq mm2, mm7 /* mm2 = id12 + id56 */
-
- movq mm3, mm7 /* mm3 = id12 + id56 */
- pmulhw mm7, mm0 /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
-
- psrlw mm2, 15 /* For trucation */
- paddw mm7, mm3 /* mm7 = xC4S4 * ( id12 + id56 ) */
- paddw mm7, mm2 /* Truncate mm7, now it is icommon_product2 */
- /*---------------------------------------------------------*/
- pxor mm0, mm0 /* Clear mm0 */
- psubsw mm0, mm6 /* mm0 = - id34 */
- psubsw mm0, mm7 /* mm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
- paddsw mm6, mm6 /* mm6 = id34 * 2 */
- paddsw mm6, mm0 /* mm6 = id34 - icommon_product2 = irot_input_x for 35 */
- psubsw mm4, mm1 /* mm4 = id07 - icommon_product1 = irot_input_x for 35*/
- paddsw mm1, mm1 /* mm1 = icommon_product1 * 2 */
- paddsw mm1, mm4 /* mm1 = id07 + icommon_product1 = irot_input_x for 17*/
- /*---------------------------------------------------------*/
- /* op1 and op7 */
- /*---------------------------------------------------------*/
- movq mm7, C(1) /* xC1S7 */
- movq mm2, mm1 /* mm2 = irot_input_x */
-
- movq mm3, mm1; /* mm3 = irot_input_x */
- pmulhw mm1, mm7 /* mm1 = xC1S7 * irot_input_x - irot_input_x */
-
- movq mm7, C(7) /* xC7S1 */
- psrlw mm2, 15 /* for trucation */
-
- paddw mm1, mm3 /* mm1 = xC1S7 * irot_input_x */
- paddw mm1, mm2 /* Trucated */
-
- pmulhw mm3, mm7 /* mm3 = xC7S1 * irot_input_x */
- paddw mm3, mm2 /* Truncated */
-
- movq mm5, mm0 /* mm5 = irot_input_y */
- movq mm2, mm0 /* mm2 = irot_input_y */
-
- movq mm7, C(1) /* xC1S7 */
- pmulhw mm0, mm7 /* mm0 = xC1S7 * irot_input_y - irot_input_y */
-
- movq mm7, C(7) /* xC7S1 */
- psrlw mm2, 15 /* for trucation */
-
- paddw mm0, mm5 /* mm0 = xC1S7 * irot_input_y */
- paddw mm0, mm2 /* Truncated */
-
- pmulhw mm5, mm7 /* mm5 = xC7S1 * irot_input_y */
- paddw mm5, mm2 /* Truncated */
-
- psubsw mm1, mm5 /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
- paddsw mm3, mm0 /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
-
- movq OL(1), mm1
- movq OH(3), mm3
- /*---------------------------------------------------------*/
- /* op3 and op5 */
- /*---------------------------------------------------------*/
- movq mm0, C(3) /* xC3S5 */
- movq mm1, C(5) /* xC5S3 */
- movq mm5,mm6 /* irot_input_x */
- movq mm7,mm6 /* irot_input_x */
- movq mm2,mm4 /* irot_input_y */
- movq mm3,mm4 /* irot_input_y */
- pmulhw mm4,mm0 /* mm4 = xC3S5 * irot_input_x - irot_input_x */
- pmulhw mm6,mm1 /* mm6 = xC5S3 * irot_input_y - irot_input_y */
- psrlw mm2,15 /* for trucation */
- psrlw mm5,15 /* for trucation */
- paddw mm4,mm3 /* mm4 = xC3S5 * irot_input_x */
- paddw mm6,mm7 /* mm6 = xC5S3 * irot_input_y */
- paddw mm4,mm2 /* Truncated */
- paddw mm6,mm5 /* Truncated */
- psubsw mm4,mm6 /* op [3] */
- movq OL(3),mm4 /* Save Op[3] */
- movq mm4,mm3 /* irot_input_y */
- movq mm6,mm7 /* irot_input_x */
- pmulhw mm3,mm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
- pmulhw mm7,mm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
- paddw mm4,mm2 /* Trucated */
- paddw mm6,mm5 /* Trucated */
- paddw mm3,mm4 /* mm3 = xC5S3 * irot_input_x */
- paddw mm7,mm6 /* mm7 = xC3S5 * irot_input_y */
- paddw mm3,mm7 /* Op[5] */
- movq OH(1),mm3 /* Save Op[5] */
- /*---------------------------------------------------------*/
- /* End of 4x8 1-D FDCT */
- /*---------------------------------------------------------*/
- /******************************************************/
- /* Do 4x8 Transpose is done through 2 4x4 Transpose */
- /******************************************************/
- lea eax, [eax+64]
- lea ecx, [ecx+64]
- lea ebx, [ebx+64]
- lea edi, [edi+64]
- movq mm4, IH(0) /* mm4=e3e2e1e0 */
- movq mm0, IH(1) /* mm4=f3f2f1f0 */
-
- psllw mm4, 1 /* up precision */
- psllw mm0, 1 /* up precision */
- movq mm5, mm4 /* make a copy */
- punpcklwd mm4, mm0 /* mm4=f1e1f0e0 */
-
- punpckhwd mm5, mm0 /* mm5=f3e3f2e2 */
- movq mm6, IH(2) /* mm6=g3g2g1g0 */
- movq mm0, IH(3) /* mm0=h3h2h1h0 */
- psllw mm6, 1 /* up precision */
- psllw mm0, 1 /* up precision */
- movq mm7, mm6 /* mm7=g3g2g1g0 */
-
- punpcklwd mm6, mm0 /* mm6=h1g1h0g0 */
- punpckhwd mm7, mm0 /* mm7=h3g3h2g2 */
-
- movq mm3, mm4 /* mm4=f1e1f0e0 */
- punpckldq mm4, mm6 /* mm4=h0g0f0e0 */
-
- punpckhdq mm3, mm6 /* mm3=h1g1f1e1 */
- movq mm6, mm5 /* mm5=f3e3f2e2 */
- punpckldq mm5, mm7 /* mm5=h2g2f2e2 */
- movq IH(0), mm4 /* saveh0g0f0e0 */
-
- punpckhdq mm6, mm7 /* mm6=h3g3f3e3 */
- movq IH(2), mm5 /* saveh2g2f2e2 */
- movq IH(3), mm6 /* saveh3g3f3e3 */
- /*----------------------------------------------------*/
- /* mm3 in use for IH(1) */
- /*----------------------------------------------------*/
- movq mm4, IL(0) /* mm4=a3a2a1a0 */
- movq mm0, IL(1) /* mm0=b3b2b1b0 */
- psllw mm4, 1 /* up precision */
- psllw mm0, 1 /* up precision */
-
- movq mm5, mm4 /* mm5=a3a2a1a0 */
- punpcklwd mm4, mm0 /* mm4=b1a1b0a0 */
-
- punpckhwd mm5, mm0 /* mm5=b3a3b2a2 */
- movq mm6, IL(2) /* mm6=c3c2c1c0 */
-
-
- movq mm0, IL(3) /* mm0=d3d2d1d0 */
- psllw mm6, 1 /* up precision */
- psllw mm0, 1 /* up precision */
- movq mm7, mm6 /* mm7=c3c2c1c0 */
- punpcklwd mm6, mm0 /* mm6=d1c1d0c0 */
- punpckhwd mm7, mm0 /* mm7=c3c3d2c2 */
-
- movq mm1, mm4 /* mm4=b1a1b0a0 */
- punpckldq mm4, mm6 /* mm4=d0c0b0a0 */
-
- punpckhdq mm1, mm6 /* mm1=d1c1b1a1 */
- movq mm2, mm5 /* mm5=b3a3b2a2 */
- punpckldq mm5, mm7 /* mm5=d2c2b2a2 */
- punpckhdq mm2, mm7 /* mm6=d3c3b3a3 */
-
- movq IL(2), mm5 /* saved2c2b2a2 */
- /*----------------------------------------------------*/
- /* mm1 in use for IL(1) */
- /* mm2 in use for IL(3) */
- /* mm3 in use for IH(1) */
- /* mm4 in use for IH(0) */
- /*----------------------------------------------------*/
- /******************************************************/
- /* Let's do the 4x8 forward DCT */
- /******************************************************/
- movq mm0, mm4 /* mm4 = ip0 */
- movq mm5, mm1 /* mm5 = ip1 */
-
- movq mm6, mm2 /* mm6 = ip3 */
- movq mm7, mm3 /* mm7 = ip5 */
- paddsw mm0, IH(3) /* mm0 = ip0 + ip7 */
- paddsw mm1, IL(2) /* mm1 = ip1 + ip2 */
- paddsw mm2, IH(0) /* mm2 = ip3 + ip4 */
- paddsw mm3, IH(2) /* mm3 = ip5 + ip6 */
- psubsw mm4, IH(3) /* mm4 = ip0 - ip7 */
- psubsw mm5, IL(2) /* mm5 = ip1 - ip2 */
- psubsw mm0, mm2 /* mm0 = is07 - is34 */
- paddsw mm2, mm2 /* mm2 = is34 * 2 */
-
- psubsw mm6, IH(0) /* mm6 = ip3 - ip4 */
- paddsw mm2, mm0 /* mm2 = is07 + is34 */
- psubsw mm1, mm3 /* mm1 = is12 - is56 */
- movq TIRY, mm0 /* save is07-is34 */
- paddsw mm3, mm3 /* mm3 = is56 * 2 */
- paddsw mm3, mm1 /* mm3 = is12 + is56 */
-
- psubsw mm7, IH(2) /* mm7 = ip5 -ip6 */
- psubsw mm5, mm7 /* mm5 = id12 - id56 */
-
- paddsw mm7, mm7 /* mm7 = id56 * 2 */
- paddsw mm7, mm5 /* mm7 = id12 + id56 */
- /*---------------------------------------------------------*/
- /* op0 and op4 */
- /*---------------------------------------------------------*/
- psubsw mm2, mm3 /* mm2 = is0734 - is1256 */
- paddsw mm3, mm3 /* mm3 = is1256 * 2 */
- movq mm0, mm2 /* mm0 = is0734 - is1256 */
- paddsw mm3, mm2 /* mm3 = is0734 + is1256 */
- pmulhw mm0, C(4) /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
- paddw mm0, mm2 /* mm0 = xC4S4 * ( is0734 - is1256 ) */
- psrlw mm2, 15
- paddw mm0, mm2 /* Truncate mm0, now it is op[4] */
-
- movq mm2, mm3 /* mm2 = is0734 + is1256 */
- movq OH(0), mm0 /* op4, now mm0,mm2 are free */
-
- movq mm0, mm3 /* mm0 = is0734 + is1256 */
- pmulhw mm3, C(4) /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
-
- psrlw mm2, 15
- paddw mm3, mm0 /* mm3 = xC4S4 * ( is0734 +is1256 ) */
-
- paddw mm3, mm2 /* Truncate mm3, now it is op[0] */
- movq OL(0), mm3 /* save op0 */
- /*---------------------------------------------------------*/
- /* op2 and op6 */
- /*---------------------------------------------------------*/
- movq mm3, TIRY /* mm3 = irot_input_y */
- pmulhw mm3, C(2) /* mm3 = xC2S6 * irot_input_y - irot_input_y */
-
- movq mm2, TIRY /* mm2 = irot_input_y */
- movq mm0, mm2 /* mm0 = irot_input_y */
-
- psrlw mm2, 15
- paddw mm3, mm0 /* mm3 = xC2S6 * irot_input_y */
-
- paddw mm3, mm2 /* Truncated */
- movq mm0, mm5 /* mm0 = id12 - id56 */
-
-
- movq mm2, mm5 /* mm2 = id12 - id56 */
- pmulhw mm0, C(6) /* mm0 = xC6S2 * irot_input_x */
-
- psrlw mm2, 15
- paddw mm0, mm2 /* Truncated */
-
- paddsw mm3, mm0 /* op[2] */
- movq OL(2), mm3 /* save op[2] */
-
-
- movq mm0, mm5 /* mm0 = id12 - id56 */
- movq mm2, mm5 /* mm0 = id12 - id56 */
-
- pmulhw mm5, C(2) /* mm5 = xC2S6 * irot_input_x - irot_input_x */
- psrlw mm2, 15
-
- movq mm3, TIRY /* mm3 = irot_input_y */
- paddw mm5, mm0 /* mm5 = xC2S6 * irot_input_x */
-
- paddw mm5, mm2 /* Truncated */
- movq mm2, mm3 /* mm2 = irot_input_y */
-
- pmulhw mm3, C(6) /* mm3 = xC6S2 * irot_input_y */
- psrlw mm2, 15
-
- paddw mm3, mm2 /* Truncated */
- psubsw mm3, mm5 /* mm3 = op[6] */
-
- movq OH(2), mm3
- /*-----------------------------------------------------------------------*/
- /* icommon_product1, icommon_product2 */
- /*-----------------------------------------------------------------------*/
- movq mm0, C(4) /* mm0 = xC4s4 */
- movq mm2, mm1 /* mm2 = is12 - is56 */
-
- movq mm3, mm1 /* mm3 = is12 - is56 */
- pmulhw mm1, mm0 /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
-
- psrlw mm2, 15
- paddw mm1, mm3 /* mm1 = xC4S4 * ( is12 - is56 ) */
-
- paddw mm1, mm2 /* Truncate mm1, now it is icommon_product1 */
- movq mm2, mm7 /* mm2 = id12 + id56 */
-
- movq mm3, mm7 /* mm3 = id12 + id56 */
- pmulhw mm7, mm0 /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
-
- psrlw mm2, 15 /* For trucation */
- paddw mm7, mm3 /* mm7 = xC4S4 * ( id12 + id56 ) */
- paddw mm7, mm2 /* Truncate mm7, now it is icommon_product2 */
- /*---------------------------------------------------------*/
- pxor mm0, mm0 /* Clear mm0 */
- psubsw mm0, mm6 /* mm0 = - id34 */
- psubsw mm0, mm7 /* mm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
- paddsw mm6, mm6 /* mm6 = id34 * 2 */
- paddsw mm6, mm0 /* mm6 = id34 - icommon_product2 = irot_input_x for 35 */
- psubsw mm4, mm1 /* mm4 = id07 - icommon_product1 = irot_input_x for 35*/
- paddsw mm1, mm1 /* mm1 = icommon_product1 * 2 */
- paddsw mm1, mm4 /* mm1 = id07 + icommon_product1 = irot_input_x for 17*/
- /*---------------------------------------------------------*/
- /* op1 and op7 */
- /*---------------------------------------------------------*/
- movq mm7, C(1) /* xC1S7 */
- movq mm2, mm1 /* mm2 = irot_input_x */
-
- movq mm3, mm1; /* mm3 = irot_input_x */
- pmulhw mm1, mm7 /* mm1 = xC1S7 * irot_input_x - irot_input_x */
-
- movq mm7, C(7) /* xC7S1 */
- psrlw mm2, 15 /* for trucation */
-
- paddw mm1, mm3 /* mm1 = xC1S7 * irot_input_x */
- paddw mm1, mm2 /* Trucated */
-
- pmulhw mm3, mm7 /* mm3 = xC7S1 * irot_input_x */
- paddw mm3, mm2 /* Truncated */
-
- movq mm5, mm0 /* mm5 = irot_input_y */
- movq mm2, mm0 /* mm2 = irot_input_y */
-
- movq mm7, C(1) /* xC1S7 */
- pmulhw mm0, mm7 /* mm0 = xC1S7 * irot_input_y - irot_input_y */
-
- movq mm7, C(7) /* xC7S1 */
- psrlw mm2, 15 /* for trucation */
-
- paddw mm0, mm5 /* mm0 = xC1S7 * irot_input_y */
- paddw mm0, mm2 /* Truncated */
-
- pmulhw mm5, mm7 /* mm5 = xC7S1 * irot_input_y */
- paddw mm5, mm2 /* Truncated */
-
- psubsw mm1, mm5 /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
- paddsw mm3, mm0 /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
-
- movq OL(1), mm1
- movq OH(3), mm3
- /*---------------------------------------------------------*/
- /* op3 and op5 */
- /*---------------------------------------------------------*/
- movq mm0, C(3) /* xC3S5 */
- movq mm1, C(5) /* xC5S3 */
- movq mm5,mm6 /* irot_input_x */
- movq mm7,mm6 /* irot_input_x */
- movq mm2,mm4 /* irot_input_y */
- movq mm3,mm4 /* irot_input_y */
- pmulhw mm4,mm0 /* mm4 = xC3S5 * irot_input_x - irot_input_x */
- pmulhw mm6,mm1 /* mm6 = xC5S3 * irot_input_y - irot_input_y */
- psrlw mm2,15 /* for trucation */
- psrlw mm5,15 /* for trucation */
- paddw mm4,mm3 /* mm4 = xC3S5 * irot_input_x */
- paddw mm6,mm7 /* mm6 = xC5S3 * irot_input_y */
- paddw mm4,mm2 /* Truncated */
- paddw mm6,mm5 /* Truncated */
- psubsw mm4,mm6 /* op [3] */
- movq OL(3),mm4 /* Save Op[3] */
- movq mm4,mm3 /* irot_input_y */
- movq mm6,mm7 /* irot_input_x */
- pmulhw mm3,mm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
- pmulhw mm7,mm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
- paddw mm4,mm2 /* Trucated */
- paddw mm6,mm5 /* Trucated */
- paddw mm3,mm4 /* mm3 = xC5S3 * irot_input_x */
- paddw mm7,mm6 /* mm7 = xC3S5 * irot_input_y */
- paddw mm3,mm7 /* Op[5] */
- movq OH(1),mm3 /* Save Op[5] */
- /*---------------------------------------------------------*/
- /* End of Horizontal FDCT */
- /*---------------------------------------------------------*/
- lea eax, [ebx-64]
- lea esi, [edi-64]
- #undef IL
- #undef IH
- #undef OL
- #undef OH
- #define IL(i) [eax + 16 * i]
- #define IH(i) [ebx + 16 * i]
- #define OL(i) [eax + 16 * i]
- #define OH(i) [ebx + 16 * i]
- /******************************************************/
- /* Do 4x8 Transpose is done through 2 4x4 Transpose */
- /******************************************************/
- movq mm4, IH(0) /* mm4=e3e2e1e0 */
- movq mm0, IH(1) /* mm4=f3f2f1f0 */
-
- movq mm5, mm4 /* make a copy */
- punpcklwd mm4, mm0 /* mm4=f1e1f0e0 */
-
- punpckhwd mm5, mm0 /* mm5=f3e3f2e2 */
- movq mm6, IH(2) /* mm6=g3g2g1g0 */
- movq mm0, IH(3) /* mm0=h3h2h1h0 */
- movq mm7, mm6 /* mm7=g3g2g1g0 */
-
- punpcklwd mm6, mm0 /* mm6=h1g1h0g0 */
- punpckhwd mm7, mm0 /* mm7=h3g3h2g2 */
-
- movq mm3, mm4 /* mm4=f1e1f0e0 */
- punpckldq mm4, mm6 /* mm4=h0g0f0e0 */
-
- punpckhdq mm3, mm6 /* mm3=h1g1f1e1 */
- movq mm6, mm5 /* mm5=f3e3f2e2 */
- punpckldq mm5, mm7 /* mm5=h2g2f2e2 */
- movq IH(0), mm4 /* saveh0g0f0e0 */
-
- punpckhdq mm6, mm7 /* mm6=h3g3f3e3 */
- movq IH(2), mm5 /* saveh2g2f2e2 */
- movq IH(3), mm6 /* saveh3g3f3e3 */
- /*----------------------------------------------------*/
- /* mm3 in use for IH(1) */
- /*----------------------------------------------------*/
- movq mm4, IL(0) /* mm4=a3a2a1a0 */
- movq mm0, IL(1) /* mm0=b3b2b1b0 */
-
- movq mm5, mm4 /* mm5=a3a2a1a0 */
- punpcklwd mm4, mm0 /* mm4=b1a1b0a0 */
-
- punpckhwd mm5, mm0 /* mm5=b3a3b2a2 */
- movq mm6, IL(2) /* mm6=c3c2c1c0 */
-
- movq mm0, IL(3) /* mm0=d3d2d1d0 */
- movq mm7, mm6 /* mm7=c3c2c1c0 */
- punpcklwd mm6, mm0 /* mm6=d1c1d0c0 */
- punpckhwd mm7, mm0 /* mm7=c3c3d2c2 */
-
- movq mm1, mm4 /* mm4=b1a1b0a0 */
- punpckldq mm4, mm6 /* mm4=d0c0b0a0 */
-
- punpckhdq mm1, mm6 /* mm1=d1c1b1a1 */
- movq mm2, mm5 /* mm5=b3a3b2a2 */
- punpckldq mm5, mm7 /* mm5=d2c2b2a2 */
- punpckhdq mm2, mm7 /* mm6=d3c3b3a3 */
-
- movq IL(2), mm5 /* saved2c2b2a2 */
- /*----------------------------------------------------*/
- /* mm1 in use for IL(1) */
- /* mm2 in use for IL(3) */
- /* mm3 in use for IH(1) */
- /* mm4 in use for IH(0) */
- /*----------------------------------------------------*/
- /******************************************************/
- /* Let's do the 4x8 forward DCT */
- /******************************************************/
- movq mm0, mm4 /* mm4 = ip0 */
- movq mm5, mm1 /* mm5 = ip1 */
-
- movq mm6, mm2 /* mm6 = ip3 */
- movq mm7, mm3 /* mm7 = ip5 */
- paddsw mm0, IH(3) /* mm0 = ip0 + ip7 */
- paddsw mm1, IL(2) /* mm1 = ip1 + ip2 */
- paddsw mm2, IH(0) /* mm2 = ip3 + ip4 */
- paddsw mm3, IH(2) /* mm3 = ip5 + ip6 */
- psubsw mm4, IH(3) /* mm4 = ip0 - ip7 */
- psubsw mm5, IL(2) /* mm5 = ip1 - ip2 */
- psubsw mm0, mm2 /* mm0 = is07 - is34 */
- paddsw mm2, mm2 /* mm2 = is34 * 2 */
-
- psubsw mm6, IH(0) /* mm6 = ip3 - ip4 */
- paddsw mm2, mm0 /* mm2 = is07 + is34 */
- psubsw mm1, mm3 /* mm1 = is12 - is56 */
- movq TIRY, mm0 /* save is07-is34 */
- paddsw mm3, mm3 /* mm3 = is56 * 2 */
- paddsw mm3, mm1 /* mm3 = is12 + is56 */
-
- psubsw mm7, IH(2) /* mm7 = ip5 -ip6 */
- psubsw mm5, mm7 /* mm5 = id12 - id56 */
-
- paddsw mm7, mm7 /* mm7 = id56 * 2 */
- paddsw mm7, mm5 /* mm7 = id12 + id56 */
- /*---------------------------------------------------------*/
- /* op0 and op4 */
- /*---------------------------------------------------------*/
- psubsw mm2, mm3 /* mm2 = is0734 - is1256 */
- paddsw mm3, mm3 /* mm3 = is1256 * 2 */
- movq mm0, mm2 /* mm0 = is0734 - is1256 */
- paddsw mm3, mm2 /* mm3 = is0734 + is1256 */
- pmulhw mm0, C(4) /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
- paddw mm0, mm2 /* mm0 = xC4S4 * ( is0734 - is1256 ) */
- psrlw mm2, 15
- paddw mm0, mm2 /* Truncate mm0, now it is op[4] */
-
- movq mm2, mm0
- psrlw mm0, 15
-
- paddw mm0, mm2
- psraw mm0, 1
- movq OH(0), mm0 /* op4, now mm0,mm2 are free */
- movq mm2, mm3 /* mm2 = is0734 + is1256 */
-
- movq mm0, mm3 /* mm0 = is0734 + is1256 */
- pmulhw mm3, C(4) /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
-
- psrlw mm2, 15
- paddw mm3, mm0 /* mm3 = xC4S4 * ( is0734 +is1256 ) */
-
- paddw mm3, mm2 /* Truncate mm3, now it is op[0] */
- movq mm2, mm3
- psrlw mm3, 15
- paddw mm3, mm2
-
- psraw mm3, 1
- movq OL(0), mm3 /* save op0 */
- /*---------------------------------------------------------*/
- /* op2 and op6 */
- /*---------------------------------------------------------*/
- movq mm3, TIRY /* mm3 = irot_input_y */
- pmulhw mm3, C(2) /* mm3 = xC2S6 * irot_input_y - irot_input_y */
-
- movq mm2, TIRY /* mm2 = irot_input_y */
- movq mm0, mm2 /* mm0 = irot_input_y */
-
- psrlw mm2, 15
- paddw mm3, mm0 /* mm3 = xC2S6 * irot_input_y */
-
- paddw mm3, mm2 /* Truncated */
- movq mm0, mm5 /* mm0 = id12 - id56 */
-
-
- movq mm2, mm5 /* mm2 = id12 - id56 */
- pmulhw mm0, C(6) /* mm0 = xC6S2 * irot_input_x */
-
- psrlw mm2, 15
- paddw mm0, mm2 /* Truncated */
-
- paddsw mm3, mm0 /* op[2] */
- movq mm0, mm3
- psrlw mm3, 15
- paddw mm3, mm0
-
- psraw mm3, 1
- movq OL(2), mm3 /* save op[2] */
-
- movq mm0, mm5 /* mm0 = id12 - id56 */
- movq mm2, mm5 /* mm0 = id12 - id56 */
-
- pmulhw mm5, C(2) /* mm5 = xC2S6 * irot_input_x - irot_input_x */
- psrlw mm2, 15
-
- movq mm3, TIRY /* mm3 = irot_input_y */
- paddw mm5, mm0 /* mm5 = xC2S6 * irot_input_x */
-
- paddw mm5, mm2 /* Truncated */
- movq mm2, mm3 /* mm2 = irot_input_y */
-
- pmulhw mm3, C(6) /* mm3 = xC6S2 * irot_input_y */
- psrlw mm2, 15
-
- paddw mm3, mm2 /* Truncated */
- psubsw mm3, mm5 /* mm3 = op[6] */
- movq mm5, mm3
- psrlw mm3, 15
-
- paddw mm3, mm5
- psraw mm3, 1
- movq OH(2), mm3
- /*-----------------------------------------------------------------------*/
- /* icommon_product1, icommon_product2 */
- /*-----------------------------------------------------------------------*/
- movq mm0, C(4) /* mm0 = xC4s4 */
- movq mm2, mm1 /* mm2 = is12 - is56 */
-
- movq mm3, mm1 /* mm3 = is12 - is56 */
- pmulhw mm1, mm0 /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
-
- psrlw mm2, 15
- paddw mm1, mm3 /* mm1 = xC4S4 * ( is12 - is56 ) */
-
- paddw mm1, mm2 /* Truncate mm1, now it is icommon_product1 */
- movq mm2, mm7 /* mm2 = id12 + id56 */
-
- movq mm3, mm7 /* mm3 = id12 + id56 */
- pmulhw mm7, mm0 /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
-
- psrlw mm2, 15 /* For trucation */
- paddw mm7, mm3 /* mm7 = xC4S4 * ( id12 + id56 ) */
- paddw mm7, mm2 /* Truncate mm7, now it is icommon_product2 */
- /*---------------------------------------------------------*/
- pxor mm0, mm0 /* Clear mm0 */
- psubsw mm0, mm6 /* mm0 = - id34 */
- psubsw mm0, mm7 /* mm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
- paddsw mm6, mm6 /* mm6 = id34 * 2 */
- paddsw mm6, mm0 /* mm6 = id34 - icommon_product2 = irot_input_x for 35 */
- psubsw mm4, mm1 /* mm4 = id07 - icommon_product1 = irot_input_x for 35*/
- paddsw mm1, mm1 /* mm1 = icommon_product1 * 2 */
- paddsw mm1, mm4 /* mm1 = id07 + icommon_product1 = irot_input_x for 17*/
- /*---------------------------------------------------------*/
- /* op1 and op7 */
- /*---------------------------------------------------------*/
- movq mm7, C(1) /* xC1S7 */
- movq mm2, mm1 /* mm2 = irot_input_x */
-
- movq mm3, mm1; /* mm3 = irot_input_x */
- pmulhw mm1, mm7 /* mm1 = xC1S7 * irot_input_x - irot_input_x */
-
- movq mm7, C(7) /* xC7S1 */
- psrlw mm2, 15 /* for trucation */
-
- paddw mm1, mm3 /* mm1 = xC1S7 * irot_input_x */
- paddw mm1, mm2 /* Trucated */
-
- pmulhw mm3, mm7 /* mm3 = xC7S1 * irot_input_x */
- paddw mm3, mm2 /* Truncated */
-
- movq mm5, mm0 /* mm5 = irot_input_y */
- movq mm2, mm0 /* mm2 = irot_input_y */
-
- movq mm7, C(1) /* xC1S7 */
- pmulhw mm0, mm7 /* mm0 = xC1S7 * irot_input_y - irot_input_y */
-
- movq mm7, C(7) /* xC7S1 */
- psrlw mm2, 15 /* for trucation */
-
- paddw mm0, mm5 /* mm0 = xC1S7 * irot_input_y */
- paddw mm0, mm2 /* Truncated */
-
- pmulhw mm5, mm7 /* mm5 = xC7S1 * irot_input_y */
- paddw mm5, mm2 /* Truncated */
-
- psubsw mm1, mm5 /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
- paddsw mm3, mm0 /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
- movq mm5, mm1
- movq mm0, mm3
- psrlw mm1, 15
- psrlw mm3, 15
- paddw mm1, mm5
- paddw mm3, mm0
- psraw mm1, 1
- psraw mm3, 1
-
- movq OL(1), mm1
- movq OH(3), mm3
- /*---------------------------------------------------------*/
- /* op3 and op5
- /*---------------------------------------------------------*/
- movq mm0, C(3) /* xC3S5 */
- movq mm1, C(5) /* xC5S3 */
- movq mm5,mm6 /* irot_input_x */
- movq mm7,mm6 /* irot_input_x */
- movq mm2,mm4 /* irot_input_y */
- movq mm3,mm4 /* irot_input_y */
- pmulhw mm4,mm0 /* mm4 = xC3S5 * irot_input_x - irot_input_x */
- pmulhw mm6,mm1 /* mm6 = xC5S3 * irot_input_y - irot_input_y */
- psrlw mm2,15 /* for trucation */
- psrlw mm5,15 /* for trucation */
- paddw mm4,mm3 /* mm4 = xC3S5 * irot_input_x */
- paddw mm6,mm7 /* mm6 = xC5S3 * irot_input_y */
- paddw mm4,mm2 /* Truncated */
- paddw mm6,mm5 /* Truncated */
- psubsw mm4,mm6 /* op [3] */
- movq mm6,mm4
- psrlw mm4,15
- paddw mm4,mm6
- psraw mm4,1
- movq OL(3),mm4 /* Save Op[3] */
- movq mm4,mm3 /* irot_input_y */
- movq mm6,mm7 /* irot_input_x */
- pmulhw mm3,mm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
- pmulhw mm7,mm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
- paddw mm4,mm2 /* Trucated */
- paddw mm6,mm5 /* Trucated */
- paddw mm3,mm4 /* mm3 = xC5S3 * irot_input_x */
- paddw mm7,mm6 /* mm7 = xC3S5 * irot_input_y */
- paddw mm3,mm7 /* Op[5] */
- movq mm7,mm3
- psrlw mm3,15
- paddw mm3,mm7
- psraw mm3,1
- movq OH(1),mm3 /* Save Op[5] */
- /*---------------------------------------------------------*/
- /* End of 4x8 1-D FDCT */
- /*---------------------------------------------------------*/
- lea eax, [eax+8]
- lea ebx, [ebx+8]
- /******************************************************/
- /* Do 4x8 Transpose is done through 2 4x4 Transpose */
- /******************************************************/
- movq mm4, IH(0) /* mm4=e3e2e1e0 */
- movq mm0, IH(1) /* mm4=f3f2f1f0 */
-
- movq mm5, mm4 /* make a copy */
- punpcklwd mm4, mm0 /* mm4=f1e1f0e0 */
-
- punpckhwd mm5, mm0 /* mm5=f3e3f2e2 */
- movq mm6, IH(2) /* mm6=g3g2g1g0 */
- movq mm0, IH(3) /* mm0=h3h2h1h0 */
- movq mm7, mm6 /* mm7=g3g2g1g0 */
-
- punpcklwd mm6, mm0 /* mm6=h1g1h0g0 */
- punpckhwd mm7, mm0 /* mm7=h3g3h2g2 */
-
- movq mm3, mm4 /* mm4=f1e1f0e0 */
- punpckldq mm4, mm6 /* mm4=h0g0f0e0 */
-
- punpckhdq mm3, mm6 /* mm3=h1g1f1e1 */
- movq mm6, mm5 /* mm5=f3e3f2e2 */
- punpckldq mm5, mm7 /* mm5=h2g2f2e2 */
- movq IH(0), mm4 /* saveh0g0f0e0 */
-
- punpckhdq mm6, mm7 /* mm6=h3g3f3e3 */
- movq IH(2), mm5 /* saveh2g2f2e2 */
- movq IH(3), mm6 /* saveh3g3f3e3 */
- /*----------------------------------------------------*/
- /* mm3 in use for IH(1) */
- /*----------------------------------------------------*/
- movq mm4, IL(0) /* mm4=a3a2a1a0 */
- movq mm0, IL(1) /* mm0=b3b2b1b0 */
-
- movq mm5, mm4 /* mm5=a3a2a1a0 */
- punpcklwd mm4, mm0 /* mm4=b1a1b0a0 */
-
- punpckhwd mm5, mm0 /* mm5=b3a3b2a2 */
- movq mm6, IL(2) /* mm6=c3c2c1c0 */
-
- movq mm0, IL(3) /* mm0=d3d2d1d0 */
- movq mm7, mm6 /* mm7=c3c2c1c0 */
- punpcklwd mm6, mm0 /* mm6=d1c1d0c0 */
- punpckhwd mm7, mm0 /* mm7=c3c3d2c2 */
-
- movq mm1, mm4 /* mm4=b1a1b0a0 */
- punpckldq mm4, mm6 /* mm4=d0c0b0a0 */
-
- punpckhdq mm1, mm6 /* mm1=d1c1b1a1 */
- movq mm2, mm5 /* mm5=b3a3b2a2 */
- punpckldq mm5, mm7 /* mm5=d2c2b2a2 */
- punpckhdq mm2, mm7 /* mm6=d3c3b3a3 */
-
- movq IL(2), mm5 /* saved2c2b2a2 */
- /*----------------------------------------------------*/
- /* mm1 in use for IL(1) */
- /* mm2 in use for IL(3) */
- /* mm3 in use for IH(1) */
- /* mm4 in use for IH(0) */
- /*----------------------------------------------------*/
- /******************************************************/
- /* Let's do the 4x8 forward DCT */
- /******************************************************/
- movq mm0, mm4 /* mm4 = ip0 */
- movq mm5, mm1 /* mm5 = ip1 */
-
- movq mm6, mm2 /* mm6 = ip3 */
- movq mm7, mm3 /* mm7 = ip5 */
- paddsw mm0, IH(3) /* mm0 = ip0 + ip7 */
- paddsw mm1, IL(2) /* mm1 = ip1 + ip2 */
- paddsw mm2, IH(0) /* mm2 = ip3 + ip4 */
- paddsw mm3, IH(2) /* mm3 = ip5 + ip6 */
- psubsw mm4, IH(3) /* mm4 = ip0 - ip7 */
- psubsw mm5, IL(2) /* mm5 = ip1 - ip2 */
- psubsw mm0, mm2 /* mm0 = is07 - is34 */
- paddsw mm2, mm2 /* mm2 = is34 * 2 */
-
- psubsw mm6, IH(0) /* mm6 = ip3 - ip4 */
- paddsw mm2, mm0 /* mm2 = is07 + is34 */
- psubsw mm1, mm3 /* mm1 = is12 - is56 */
- movq TIRY, mm0 /* save is07-is34 */
- paddsw mm3, mm3 /* mm3 = is56 * 2 */
- paddsw mm3, mm1 /* mm3 = is12 + is56 */
-
- psubsw mm7, IH(2) /* mm7 = ip5 -ip6 */
- psubsw mm5, mm7 /* mm5 = id12 - id56 */
-
- paddsw mm7, mm7 /* mm7 = id56 * 2 */
- paddsw mm7, mm5 /* mm7 = id12 + id56 */
- /*---------------------------------------------------------*/
- /* op0 and op4
- /*---------------------------------------------------------*/
- psubsw mm2, mm3 /* mm2 = is0734 - is1256 */
- paddsw mm3, mm3 /* mm3 = is1256 * 2 */
- movq mm0, mm2 /* mm0 = is0734 - is1256 */
- paddsw mm3, mm2 /* mm3 = is0734 + is1256 */
- pmulhw mm0, C(4) /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
- paddw mm0, mm2 /* mm0 = xC4S4 * ( is0734 - is1256 ) */
- psrlw mm2, 15
- paddw mm0, mm2 /* Truncate mm0, now it is op[4] */
-
- movq mm2, mm0
- psrlw mm0, 15
-
- paddw mm0, mm2
- psraw mm0, 1
- movq OH(0), mm0 /* op4, now mm0,mm2 are free */
- movq mm2, mm3 /* mm2 = is0734 + is1256 */
-
- movq mm0, mm3 /* mm0 = is0734 + is1256 */
- pmulhw mm3, C(4) /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
-
- psrlw mm2, 15
- paddw mm3, mm0 /* mm3 = xC4S4 * ( is0734 +is1256 ) */
-
- paddw mm3, mm2 /* Truncate mm3, now it is op[0] */
- movq mm2, mm3
- psrlw mm3, 15
- paddw mm3, mm2
-
- psraw mm3, 1
- movq OL(0), mm3 /* save op0 */
- /*---------------------------------------------------------*/
- /* op2 and op6
- /*---------------------------------------------------------*/
- movq mm3, TIRY /* mm3 = irot_input_y */
- pmulhw mm3, C(2) /* mm3 = xC2S6 * irot_input_y - irot_input_y */
-
- movq mm2, TIRY /* mm2 = irot_input_y */
- movq mm0, mm2 /* mm0 = irot_input_y */
-
- psrlw mm2, 15
- paddw mm3, mm0 /* mm3 = xC2S6 * irot_input_y */
-
- paddw mm3, mm2 /* Truncated */
- movq mm0, mm5 /* mm0 = id12 - id56 */
-
-
- movq mm2, mm5 /* mm2 = id12 - id56 */
- pmulhw mm0, C(6) /* mm0 = xC6S2 * irot_input_x */
-
- psrlw mm2, 15
- paddw mm0, mm2 /* Truncated */
-
- paddsw mm3, mm0 /* op[2] */
- movq mm0, mm3
- psrlw mm3, 15
- paddw mm3, mm0
-
- psraw mm3, 1
- movq OL(2), mm3 /* save op[2] */
-
- movq mm0, mm5 /* mm0 = id12 - id56 */
- movq mm2, mm5 /* mm0 = id12 - id56 */
-
- pmulhw mm5, C(2) /* mm5 = xC2S6 * irot_input_x - irot_input_x */
- psrlw mm2, 15
-
- movq mm3, TIRY /* mm3 = irot_input_y */
- paddw mm5, mm0 /* mm5 = xC2S6 * irot_input_x */
-
- paddw mm5, mm2 /* Truncated */
- movq mm2, mm3 /* mm2 = irot_input_y */
-
- pmulhw mm3, C(6) /* mm3 = xC6S2 * irot_input_y */
- psrlw mm2, 15
-
- paddw mm3, mm2 /* Truncated */
- psubsw mm3, mm5 /* mm3 = op[6] */
- movq mm5, mm3
- psrlw mm3, 15
-
- paddw mm3, mm5
- psraw mm3, 1
- movq OH(2), mm3
- /*-----------------------------------------------------------------------*/
- /* icommon_product1, icommon_product2 */
- /*-----------------------------------------------------------------------*/
- movq mm0, C(4) /* mm0 = xC4s4 */
- movq mm2, mm1 /* mm2 = is12 - is56 */
-
- movq mm3, mm1 /* mm3 = is12 - is56 */
- pmulhw mm1, mm0 /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
-
- psrlw mm2, 15
- paddw mm1, mm3 /* mm1 = xC4S4 * ( is12 - is56 ) */
-
- paddw mm1, mm2 /* Truncate mm1, now it is icommon_product1 */
- movq mm2, mm7 /* mm2 = id12 + id56 */
-
- movq mm3, mm7 /* mm3 = id12 + id56 */
- pmulhw mm7, mm0 /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
-
- psrlw mm2, 15 /* For trucation */
- paddw mm7, mm3 /* mm7 = xC4S4 * ( id12 + id56 ) */
- paddw mm7, mm2 /* Truncate mm7, now it is icommon_product2 */
- /*---------------------------------------------------------*/
- pxor mm0, mm0 /* Clear mm0 */
- psubsw mm0, mm6 /* mm0 = - id34 */
- psubsw mm0, mm7 /* mm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
- paddsw mm6, mm6 /* mm6 = id34 * 2 */
- paddsw mm6, mm0 /* mm6 = id34 - icommon_product2 = irot_input_x for 35 */
- psubsw mm4, mm1 /* mm4 = id07 - icommon_product1 = irot_input_x for 35*/
- paddsw mm1, mm1 /* mm1 = icommon_product1 * 2 */
- paddsw mm1, mm4 /* mm1 = id07 + icommon_product1 = irot_input_x for 17*/
- /*---------------------------------------------------------*/
- /* op1 and op7
- /*---------------------------------------------------------*/
- movq mm7, C(1) /* xC1S7 */
- movq mm2, mm1 /* mm2 = irot_input_x */
-
- movq mm3, mm1; /* mm3 = irot_input_x */
- pmulhw mm1, mm7 /* mm1 = xC1S7 * irot_input_x - irot_input_x */
-
- movq mm7, C(7) /* xC7S1 */
- psrlw mm2, 15 /* for trucation */
-
- paddw mm1, mm3 /* mm1 = xC1S7 * irot_input_x */
- paddw mm1, mm2 /* Trucated */
-
- pmulhw mm3, mm7 /* mm3 = xC7S1 * irot_input_x */
- paddw mm3, mm2 /* Truncated */
-
- movq mm5, mm0 /* mm5 = irot_input_y */
- movq mm2, mm0 /* mm2 = irot_input_y */
-
- movq mm7, C(1) /* xC1S7 */
- pmulhw mm0, mm7 /* mm0 = xC1S7 * irot_input_y - irot_input_y */
-
- movq mm7, C(7) /* xC7S1 */
- psrlw mm2, 15 /* for trucation */
-
- paddw mm0, mm5 /* mm0 = xC1S7 * irot_input_y */
- paddw mm0, mm2 /* Truncated */
-
- pmulhw mm5, mm7 /* mm5 = xC7S1 * irot_input_y */
- paddw mm5, mm2 /* Truncated */
-
- psubsw mm1, mm5 /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
- paddsw mm3, mm0 /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
- movq mm5, mm1
- movq mm0, mm3
- psrlw mm1, 15
- psrlw mm3, 15
- paddw mm1, mm5
- paddw mm3, mm0
- psraw mm1, 1
- psraw mm3, 1
-
- movq OL(1), mm1
- movq OH(3), mm3
- /*---------------------------------------------------------*/
- /* op3 and op5
- /*---------------------------------------------------------*/
- movq mm0, C(3) /* xC3S5 */
- movq mm1, C(5) /* xC5S3 */
- movq mm5,mm6 /* irot_input_x */
- movq mm7,mm6 /* irot_input_x */
- movq mm2,mm4 /* irot_input_y */
- movq mm3,mm4 /* irot_input_y */
- pmulhw mm4,mm0 /* mm4 = xC3S5 * irot_input_x - irot_input_x */
- pmulhw mm6,mm1 /* mm6 = xC5S3 * irot_input_y - irot_input_y */
- psrlw mm2,15 /* for trucation */
- psrlw mm5,15 /* for trucation */
- paddw mm4,mm3 /* mm4 = xC3S5 * irot_input_x */
- paddw mm6,mm7 /* mm6 = xC5S3 * irot_input_y */
- paddw mm4,mm2 /* Truncated */
- paddw mm6,mm5 /* Truncated */
- psubsw mm4,mm6 /* op [3] */
- movq mm6,mm4
- psrlw mm4,15
- paddw mm4,mm6
- psraw mm4,1
- movq OL(3),mm4 /* Save Op[3] */
- movq mm4,mm3 /* irot_input_y */
- movq mm6,mm7 /* irot_input_x */
- pmulhw mm3,mm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
- pmulhw mm7,mm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
- paddw mm4,mm2 /* Trucated */
- paddw mm6,mm5 /* Trucated */
- paddw mm3,mm4 /* mm3 = xC5S3 * irot_input_x */
- paddw mm7,mm6 /* mm7 = xC3S5 * irot_input_y */
- paddw mm3,mm7 /* Op[5] */
- movq mm7,mm3
- psrlw mm3,15
- paddw mm3,mm7
- psraw mm3,1
- movq OH(1),mm3 /* Save Op[5] */
- /*---------------------------------------------------------*/
- /* End of 4x8 1-D FDCT */
- /*---------------------------------------------------------*/
- }/* end of _asm code section */
- }
|