12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748 |
- /****************************************************************************
- *
- * Module Title : DeRingingWmtOpt.c
- *
- * Description : Optimized functions for PostProcessor
- *
- ***************************************************************************/
- #define STRICT /* Strict type checking */
- /****************************************************************************
- * Header Files
- ****************************************************************************/
- #include "postp.h"
- /****************************************************************************
- * Macros
- ****************************************************************************/
- #pragma warning(disable:4305)
- #pragma warning(disable:4731)
- /****************************************************************************
- * Module Statics
- ****************************************************************************/
- #if defined(_WIN32_WCE) /* no constant tables on Windows CE: both routines below are stubs there */
- #else
- __declspec(align(16)) static unsigned short eight128s []= { 128, 128, 128, 128, 128, 128, 128, 128}; /* filter weight total (128) per 16-bit lane */
- __declspec(align(16)) static unsigned short eight64s[] = { 64, 64, 64, 64, 64, 64, 64, 64}; /* rounding term added before >>7; shifted left by 8 it is also the 16384 clamp bias */
- __declspec(align(16)) static char eight64c [] = { 64, 64, 64,64,64,64,64,64}; /* +64 per byte, used for the sharpen threshold */
- __declspec(align(16)) static char eight32c [] = { 32,32,32,32,32,32,32,32}; /* +32 per byte, added to QValue */
- __declspec(align(16)) static char eight127c []= { 127, 127, 127, 127, 127, 127, 127, 127}; /* 0x7f mask used to build 127 - ClampHigh */
- __declspec(align(16)) static unsigned char eight128c []= { 128, 128, 128, 128, 128, 128, 128, 128}; /* 0x80 per byte, XORed in so pcmpgtb acts as an unsigned compare; unsigned char because 128 does not fit a signed char */
- __declspec(align(16)) static unsigned char eight223c[] = { 223,223,223,223,223,223,223,223}; /* 255-32: saturating add then subtract limits a byte to 32 (strong filter) */
- __declspec(align(16)) static unsigned char eight231c[] = { 231,231,231,231,231,231,231,231}; /* 255-24: saturating add then subtract limits a byte to 24 (weak filter) */
- #endif
- /****************************************************************************
- * Imports
- ****************************************************************************/
- extern UINT32 SharpenModifier[];
- /****************************************************************************
- *
- * ROUTINE : DeringBlockStrong_WMT
- *
- * INPUTS : const POSTPROC_INSTANCE *pbi : Pointer to post-processor instance (unused).
- * const UINT8 *SrcPtr : Pointer to input image (top-left pixel of the block).
- * UINT8 *DstPtr : Pointer to output image (top-left pixel of the block).
- * const INT32 Pitch : Image stride, applied to both source and destination.
- * UINT32 FragQIndex : Q-index block encoded with; indexes QuantScale
- * and SharpenModifier.
- * UINT32 *QuantScale : Array of quantization scale factors.
- *
- * OUTPUTS : 8 rows of 8 filtered pixels written at DstPtr.
- *
- * RETURNS : void
- *
- * FUNCTION : Filtering a block for de-ringing purpose.
- *
- * Pass 1 (FillModLoop1) builds one 16-bit modifier per pixel and
- * neighbour direction from the absolute difference to that neighbour:
- * Mod = (QValue + 32 - abs(p - neighbour)), clamped below at 0 and
- * above at ClampHigh, where ClampHigh is 3*QValue limited to 32.
- * When abs(p - neighbour) exceeds QValue + 32 + 64 the Sharpen byte
- * is OR-ed in instead. All of this is 8-bit arithmetic.
- * UDMod holds 9 rows of up/down modifiers, LRMod holds 8 rows of
- * left modifiers followed by 8 rows of right modifiers.
- *
- * Pass 2 (FilterLoop1) computes, for each of the 8 rows,
- * (pu*au + pd*ad + pl*al + pr*ar + p*(128-(au+ad+al+ar)) + 64) >> 7
- * clamped to 0..255, and stores 8 bytes to the destination.
- *
- * SPECIAL NOTES : Only the low byte of QuantScale[FragQIndex] and of
- * SharpenModifier[FragQIndex] is used (it is replicated to 8 bytes).
- * The source is read one row above and below the block and one
- * pixel to the left and right of it.
- * MMX registers are used and no EMMS is executed in this routine.
- * NOTE(review): presumably the caller clears the MMX state before
- * any x87 code runs - confirm.
- * On _WIN32_WCE builds the routine returns without writing anything.
- *
- ****************************************************************************/
- void DeringBlockStrong_WMT
- (
- const POSTPROC_INSTANCE *pbi,
- const UINT8 *SrcPtr,
- UINT8 *DstPtr,
- const INT32 Pitch,
- UINT32 FragQIndex,
- UINT32 *QuantScale
- )
- {
- #if defined(_WIN32_WCE)
- return;
- #else
- __declspec(align(16)) short UDMod[72]; /* 9 rows x 8 up/down modifiers */
- __declspec(align(16)) short LRMod[128]; /* 8 rows x 8 left modifiers, then 8 rows x 8 right modifiers */
- unsigned int PlaneLineStep = Pitch;
- const unsigned char *Src = SrcPtr;
- unsigned char *Des = DstPtr;
- 
- short *UDPointer = UDMod;
- short *LRPointer = LRMod;
- 
- UINT32 QStep = QuantScale[FragQIndex];
- INT32 Sharpen = SharpenModifier[FragQIndex];
- (void) pbi;
- __asm
- {
- push esi
- push edi
- 
- mov esi, Src /* Source Pointer */
- mov edi, UDPointer /* UD modifier pointer */
- push ecx
- push edx
- mov ecx, PlaneLineStep /* Pitch Step */
- xor edx, edx
- push eax
- push ebx
- mov eax, QStep /* QValue */
- mov ebx, Sharpen /* Sharpen */
- movd mm0, eax /* QValue */
- movd mm2, ebx /* sharpen */
- push ebp
- punpcklbw mm0, mm0 /* 00 00 00 QQ */
- sub edx, ecx /* Negative Pitch */
- punpcklbw mm2, mm2 /* 00 00 00 SS */
- pxor mm7, mm7 /* clear mm7 for unpacks */
- punpcklbw mm0, mm0 /* 00 00 qq qq */
- mov eax, LRPointer /* Left and Right Modifier */
- punpcklbw mm2, mm2 /* 00 00 ss ss */
- lea ebx, [esi+ecx*8] /* Source pointer of the row just below the block (loop end) */
- punpcklbw mm0, mm0 /* qq qq qq qq */
- movq mm1, mm0; /* make a copy */
- 
- punpcklbw mm2, mm2 /* ss ss ss ss */
- paddb mm1, mm0 /* QValue * 2 */
- paddb mm1, mm0 /* High = 3 * Qvalue */
- paddusb mm1, eight223c /* clamping high to 32 */
- paddb mm0, eight32c /* 32+QValues */
- psubusb mm1, eight223c /* Get the real value back */
- movq mm3, eight127c /* 7f 7f 7f 7f 7f 7f 7f 7f */
- pandn mm1, mm3 /* 127 - ClampHigh, bias for the paddsb/psubsb clamp */
- /* mm0,mm1,mm2,mm7 are in use */
- /* mm0---> QValue+32 */
- /* mm1---> 127 - ClampHigh */
- /* mm2---> Sharpen */
- /* mm7---> Cleared for unpack */
- FillModLoop1:
- movq mm3, QWORD PTR [esi] /* read 8 pixels p */
- pxor xmm7, xmm7 /* clear xmm7 */
- movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
- movq mm5, mm3 /* make a copy of p */
- psubusb mm3, mm4 /* p-pu */
- psubusb mm4, mm5 /* pu-p */
- por mm3, mm4 /* abs(p-pu) */
- movq mm6, mm0 /* 32+QValues */
- movq mm4, mm0 /* 32+QValues */
- psubusb mm6, mm3 /* zero clamped TmpMod */
- movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
- paddb mm4, eight64c /* 32+QValues + 64 */
- pxor mm4, mm5 /* convert to a sign number */
- pxor mm3, mm5 /* convert to a sign number */
- pcmpgtb mm3, mm4 /* 32+QValue- abs(p-pu) <-64 ? */
- pand mm3, mm2 /* use sharpen */
- paddsb mm6, mm1 /* clamping to high */
- psubsb mm6, mm1 /* offset back */
- por mm6, mm3 /* Mod value to be stored */
- movq mm3, QWORD PTR [esi] /* read 8 pixels p */
- movq2dq xmm0, mm6 /* move the 8 up modifiers to SSE2 */
- movq mm4, QWORD PTR [esi-1] /* Pixels to the left pl */
- punpcklbw xmm7, xmm0 /* extended to words */
- movq mm5, mm3 /* make a copy of p */
- psraw xmm7, 8 /* sign extended */
- psubusb mm3, mm4 /* p-pl */
- movdqa [edi], xmm7 /* writeout UDmod*/
- psubusb mm4, mm5 /* pl-p */
- por mm3, mm4 /* abs(p-pl) */
- movq mm6, mm0 /* 32+QValues */
- movq mm4, mm0 /* 32+QValues */
- psubusb mm6, mm3 /* zero clamped TmpMod */
- movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
- paddb mm4, eight64c /* 32+QValues + 64 */
- pxor mm4, mm5 /* convert to a sign number */
- pxor mm3, mm5 /* convert to a sign number */
- pcmpgtb mm3, mm4 /* 32+QValue- abs(p-pl) <-64 ? */
- pand mm3, mm2 /* use sharpen */
- paddsb mm6, mm1 /* clamping to high */
- psubsb mm6, mm1 /* offset back */
- por mm6, mm3 /* Mod value to be stored */
- movq mm3, QWORD PTR [esi] /* read 8 pixels p */
- pxor xmm7, xmm7 /* clear xmm7 */
- movq mm4, QWORD PTR [esi+1] /* Pixels to the right pr */
- movq2dq xmm0, mm6 /* move the 8 left modifiers to SSE2 */
- movq mm5, mm3 /* make a copy of p */
- punpcklbw xmm7, xmm0 /* extended to shorts */
- psubusb mm3, mm4 /* p-pr */
- psraw xmm7, 8 /* sign extended */
- psubusb mm4, mm5 /* pr-p */
- movdqa [eax], xmm7 /* writeout left LRMod */
- por mm3, mm4 /* abs(p-pr) */
- movq mm6, mm0 /* 32+QValues */
- pxor xmm7, xmm7 /* clear xmm7 */
- movq mm4, mm0 /* 32+QValues */
- psubusb mm6, mm3 /* zero clamped TmpMod */
- movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
- paddb mm4, eight64c /* 32+QValues + 64 */
- pxor mm4, mm5 /* convert to a sign number */
- pxor mm3, mm5 /* convert to a sign number */
- pcmpgtb mm3, mm4 /* 32+QValue- abs(p-pr) <-64 ? */
- pand mm3, mm2 /* use sharpen */
- paddsb mm6, mm1 /* clamping to high */
- psubsb mm6, mm1 /* offset back */
- por mm6, mm3 /* Mod value to be stored */
- add esi, ecx
- 
- movq2dq xmm0, mm6 /* move the 8 right modifiers to SSE2 */
- add edi, 16
- punpcklbw xmm7, xmm0 /* extended to shorts (source is xmm0, loaded by movq2dq above) */
- add eax, 16
- psraw xmm7, 8 /* sign extended */
- cmp esi, ebx
- movdqa [eax+112], xmm7 /* writeout right LRMod: eax already advanced, so this is 128 bytes after this row's left entry */
- jne FillModLoop1
- 
- /* last UDMod */
- movq mm3, QWORD PTR [esi] /* read 8 pixels p */
- pxor xmm7, xmm7 /* clear xmm7 */
- movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
- movq mm5, mm3 /* make a copy of p */
- 
- psubusb mm3, mm4 /* p-pu */
- psubusb mm4, mm5 /* pu-p */
- por mm3, mm4 /* abs(p-pu) */
- movq mm6, mm0 /* 32+QValues */
- movq mm4, mm0 /* 32+QValues */
- psubusb mm6, mm3 /* zero clamped TmpMod */
- movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
- paddb mm4, eight64c /* 32+QValues + 64 */
- pxor mm4, mm5 /* convert to a sign number */
- pxor mm3, mm5 /* convert to a sign number */
- pcmpgtb mm3, mm4 /* 32+QValue- abs(p-pu) <-64 ? */
- pand mm3, mm2 /* use sharpen */
- paddsb mm6, mm1 /* clamping to high */
- psubsb mm6, mm1 /* offset back */
- por mm6, mm3 /* Mod value to be stored */
- movq2dq xmm6, mm6
- punpcklbw xmm7, xmm6 /* 03 xx 02 xx 01 xx 00 xx */
- psraw xmm7, 8 /* sign extended */
- movdqa [edi], xmm7 /* writeout UDmod */
- mov esi, Src
- mov edi, Des
- 
- mov eax, UDPointer
- mov ebx, LRPointer
- mov ebp, 8 /* row counter; locals are not referenced again once ebp is overwritten */
- FilterLoop1:
- movq xmm0, QWORD PTR [esi+edx] /* xmm0 = Pixels above */
- pxor xmm7, xmm7 /* clear xmm7 */
- movdqa xmm4, [eax] /* au */
- punpcklbw xmm0, xmm7 /* extended to shorts */
- 
- movq xmm2, QWORD PTR [esi+ecx] /* xmm2 = pixels below */
- pmullw xmm0, xmm4 /* pu*au */
- 
- movdqa xmm6, [eax+16] /* ad */
- punpcklbw xmm2, xmm7 /* extended to shorts*/
- 
- movq xmm1, QWORD PTR [esi-1] /* pixel to the left */
- pmullw xmm2, xmm6 /* ad*pd */
- 
- movdqa xmm3, [ebx] /* al */
- punpcklbw xmm1, xmm7 /* extended to shorts */
- movq xmm5, QWORD PTR [esi+1] /* pixel to the right */
- pmullw xmm1, xmm3 /* al * pl */
- paddw xmm4, xmm6 /* au+ad */
- punpcklbw xmm5, xmm7 /* extends to shorts */
- 
- movdqa xmm6, [ebx+128] /* ar */
- pmullw xmm5, xmm6 /* ar * pr */
- 
- paddw xmm0, xmm2 /* au*pu + ad*pd */
- paddw xmm4, xmm3 /* au+ad+al */
- paddw xmm0, xmm1 /* au*pu+ad*pd+al*pl */
- paddw xmm4, xmm6 /* au+ad+al+ar */
- movq xmm2, QWORD PTR [esi] /* p */
- paddw xmm0, xmm5 /* au*pu+ad*pd+al*pl+ar*pr */
- 
- /* xmm0 --- au*pu+ad*pd+al*pl+ar*pr */
- /* xmm4 --- au + ad + al + ar */
- 
- movdqa xmm1, eight128s /* 0080 0080 0080 0080 0080 0080 0080 0080 */
- punpcklbw xmm2, xmm7 /* extended to shorts */
- psubw xmm1, xmm4 /* 128-(au+ad+al+ar) */
- pmullw xmm2, xmm1 /* p*(128-(au+ad+al+ar)) */
- 
- add esi, ecx /* Src += Pitch */
- movdqa xmm6, eight64s /* 64, 64, 64, 64, 64, 64, 64, 64 */
- movdqa xmm7, xmm6 /* 64, 64, 64, 64, 64, 64, 64, 64 */
- add eax, 16 /* UDPointer += 8 */
- psllw xmm7, 8 /* {16384, .. } */
- paddw xmm0, xmm2 /* sum */
- add edi, ecx /* Des += Pitch */
- paddw xmm0, xmm6 /* sum+B */
- add ebx, 16 /* LPointer +=8 */
- paddw xmm0, xmm7 /* clamping: bias by 16384 ... */
- psubusw xmm0, xmm7 /* ... then unsigned-saturating subtract */
- dec ebp
- psrlw xmm0, 7 /* (sum+B)>>7 */
- packuswb xmm0, xmm7 /* pack to 8 bytes */
- movq QWORD PTR [edi+edx], xmm0 /* write to destination (edi was already advanced, edx = -Pitch) */
- jnz FilterLoop1
- 
- pop ebp
- pop ebx
- pop eax
- pop edx
- pop ecx
- pop edi
- pop esi
- }
- #endif
- }
- /****************************************************************************
- *
- * ROUTINE : DeringBlockWeak_WMT
- *
- * INPUTS : const POSTPROC_INSTANCE *pbi : Pointer to post-processor instance (unused).
- * const UINT8 *SrcPtr : Pointer to input image (top-left pixel of the block).
- * UINT8 *DstPtr : Pointer to output image (top-left pixel of the block).
- * const INT32 Pitch : Image stride, applied to both source and destination.
- * UINT32 FragQIndex : Q-index block encoded with; indexes QuantScale
- * and SharpenModifier.
- * UINT32 *QuantScale : Array of quantization scale factors.
- *
- * OUTPUTS : 8 rows of 8 filtered pixels written at DstPtr.
- *
- * RETURNS : void
- *
- * FUNCTION : Filtering a block for de-ringing purpose.
- *
- * Pass 1 (FillModLoop1) builds one 16-bit modifier per pixel and
- * neighbour direction from the absolute difference to that neighbour:
- * Mod = (QValue + 32 - 2*abs(p - neighbour)), clamped below at 0 and
- * above at ClampHigh, where ClampHigh is 3*QValue limited to 24.
- * When 2*abs(p - neighbour) exceeds QValue + 32 + 64 the Sharpen
- * byte is OR-ed in instead. All of this is 8-bit arithmetic.
- * UDMod holds 9 rows of up/down modifiers, LRMod holds 8 rows of
- * left modifiers followed by 8 rows of right modifiers.
- *
- * Pass 2 (FilterLoop1) computes, for each of the 8 rows,
- * (pu*au + pd*ad + pl*al + pr*ar + p*(128-(au+ad+al+ar)) + 64) >> 7
- * clamped to 0..255, and stores 8 bytes to the destination.
- *
- * SPECIAL NOTES : Only the low byte of QuantScale[FragQIndex] and of
- * SharpenModifier[FragQIndex] is used (it is replicated to 8 bytes).
- * The source is read one row above and below the block and one
- * pixel to the left and right of it.
- * MMX registers are used and no EMMS is executed in this routine.
- * NOTE(review): presumably the caller clears the MMX state before
- * any x87 code runs - confirm.
- * On _WIN32_WCE builds the routine returns without writing anything.
- *
- ****************************************************************************/
- void DeringBlockWeak_WMT
- (
- const POSTPROC_INSTANCE *pbi,
- const UINT8 *SrcPtr,
- UINT8 *DstPtr,
- const INT32 Pitch,
- UINT32 FragQIndex,
- UINT32 *QuantScale
- )
- {
- #if defined(_WIN32_WCE)
- return;
- #else
- __declspec(align(16)) short UDMod[72]; /* 9 rows x 8 up/down modifiers */
- __declspec(align(16)) short LRMod[128]; /* 8 rows x 8 left modifiers, then 8 rows x 8 right modifiers */
- 
- unsigned int PlaneLineStep = Pitch;
- const unsigned char *Src = SrcPtr;
- unsigned char *Des = DstPtr;
- 
- short *UDPointer = UDMod;
- short *LRPointer = LRMod;
- 
- UINT32 QStep = QuantScale[FragQIndex];
- INT32 Sharpen = SharpenModifier[FragQIndex];
- (void) pbi;
- __asm
- {
- push esi
- push edi
- 
- mov esi, Src /* Source Pointer */
- mov edi, UDPointer /* UD modifier pointer */
- push ecx
- push edx
- mov ecx, PlaneLineStep /* Pitch Step */
- xor edx, edx
- push eax
- push ebx
- mov eax, QStep /* QValue */
- mov ebx, Sharpen /* Sharpen */
- movd mm0, eax /* QValue */
- movd mm2, ebx /* sharpen */
- push ebp
- punpcklbw mm0, mm0 /* 00 00 00 QQ */
- sub edx, ecx /* Negative Pitch */
- punpcklbw mm2, mm2 /* 00 00 00 SS */
- pxor mm7, mm7 /* clear mm7 for unpacks */
- punpcklbw mm0, mm0 /* 00 00 qq qq */
- mov eax, LRPointer /* Left and Right Modifier */
- punpcklbw mm2, mm2 /* 00 00 ss ss */
- lea ebx, [esi+ecx*8] /* Source pointer of the row just below the block (loop end) */
- punpcklbw mm0, mm0 /* qq qq qq qq */
- movq mm1, mm0; /* make a copy */
- 
- punpcklbw mm2, mm2 /* ss ss ss ss */
- paddb mm1, mm0 /* QValue * 2 */
- paddb mm1, mm0 /* High = 3 * Qvalue */
- paddusb mm1, eight231c /* clamping high to 24 */
- paddb mm0, eight32c /* 32+QValues */
- psubusb mm1, eight231c /* Get the real value back */
- movq mm3, eight127c /* 7f 7f 7f 7f 7f 7f 7f 7f */
- pandn mm1, mm3 /* 127 - ClampHigh, bias for the paddsb/psubsb clamp */
- /* mm0,mm1,mm2,mm7 are in use */
- /* mm0---> QValue+32 */
- /* mm1---> 127 - ClampHigh */
- /* mm2---> Sharpen */
- /* mm7---> Cleared for unpack */
- FillModLoop1:
- movq mm3, QWORD PTR [esi] /* read 8 pixels p */
- pxor xmm7, xmm7 /* clear xmm7 */
- movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
- movq mm5, mm3 /* make a copy of p */
- psubusb mm3, mm4 /* p-pu */
- psubusb mm4, mm5 /* pu-p */
- por mm3, mm4 /* abs(p-pu) */
- movq mm6, mm0 /* 32+QValues */
- paddusb mm3, mm3 /* 2*abs(p-pu) */
- movq mm4, mm0 /* 32+QValues */
- psubusb mm6, mm3 /* zero clamped TmpMod */
- movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
- paddb mm4, eight64c /* 32+QValues + 64 */
- pxor mm4, mm5 /* convert to a sign number */
- pxor mm3, mm5 /* convert to a sign number */
- pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
- pand mm3, mm2 /* use sharpen */
- paddsb mm6, mm1 /* clamping to high */
- psubsb mm6, mm1 /* offset back */
- por mm6, mm3 /* Mod value to be stored */
- movq mm3, QWORD PTR [esi] /* read 8 pixels p */
- movq2dq xmm0, mm6 /* move the 8 up modifiers to SSE2 */
- movq mm4, QWORD PTR [esi-1] /* Pixels to the left pl */
- punpcklbw xmm7, xmm0 /* extended to words */
- movq mm5, mm3 /* make a copy of p */
- psraw xmm7, 8 /* sign extended */
- psubusb mm3, mm4 /* p-pl */
- movdqa [edi], xmm7 /* writeout UDmod*/
- psubusb mm4, mm5 /* pl-p */
- por mm3, mm4 /* abs(p-pl) */
- movq mm6, mm0 /* 32+QValues */
- paddusb mm3, mm3 /* 2*abs(p-pl) */
- movq mm4, mm0 /* 32+QValues */
- psubusb mm6, mm3 /* zero clamped TmpMod */
- movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
- paddb mm4, eight64c /* 32+QValues + 64 */
- pxor mm4, mm5 /* convert to a sign number */
- pxor mm3, mm5 /* convert to a sign number */
- pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pl) <-64 ? */
- pand mm3, mm2 /* use sharpen */
- paddsb mm6, mm1 /* clamping to high */
- psubsb mm6, mm1 /* offset back */
- por mm6, mm3 /* Mod value to be stored */
- movq mm3, QWORD PTR [esi] /* read 8 pixels p */
- pxor xmm7, xmm7 /* clear xmm7 */
- movq mm4, QWORD PTR [esi+1] /* Pixels to the right pr */
- movq2dq xmm0, mm6 /* move the 8 left modifiers to SSE2 */
- movq mm5, mm3 /* make a copy of p */
- punpcklbw xmm7, xmm0 /* extended to shorts */
- psubusb mm3, mm4 /* p-pr */
- psraw xmm7, 8 /* sign extended */
- psubusb mm4, mm5 /* pr-p */
- movdqa [eax], xmm7 /* writeout left LRMod */
- por mm3, mm4 /* abs(p-pr) */
- movq mm6, mm0 /* 32+QValues */
- paddusb mm3, mm3 /* 2*abs(p-pr) */
- pxor xmm7, xmm7 /* clear xmm7 */
- movq mm4, mm0 /* 32+QValues */
- psubusb mm6, mm3 /* zero clamped TmpMod */
- movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
- paddb mm4, eight64c /* 32+QValues + 64 */
- pxor mm4, mm5 /* convert to a sign number */
- pxor mm3, mm5 /* convert to a sign number */
- pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pr) <-64 ? */
- pand mm3, mm2 /* use sharpen */
- paddsb mm6, mm1 /* clamping to high */
- psubsb mm6, mm1 /* offset back */
- por mm6, mm3 /* Mod value to be stored */
- add esi, ecx
- 
- movq2dq xmm0, mm6 /* move the 8 right modifiers to SSE2 */
- add edi, 16
- punpcklbw xmm7, xmm0 /* extended to shorts (source is xmm0, loaded by movq2dq above) */
- add eax, 16
- psraw xmm7, 8 /* sign extended */
- cmp esi, ebx
- movdqa [eax+112], xmm7 /* writeout right LRMod: eax already advanced, so this is 128 bytes after this row's left entry */
- jne FillModLoop1
- 
- /* last UDMod */
- movq mm3, QWORD PTR [esi] /* read 8 pixels p */
- pxor xmm7, xmm7 /* clear xmm7 */
- movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
- movq mm5, mm3 /* make a copy of p */
- 
- psubusb mm3, mm4 /* p-pu */
- psubusb mm4, mm5 /* pu-p */
- por mm3, mm4 /* abs(p-pu) */
- movq mm6, mm0 /* 32+QValues */
- paddusb mm3, mm3 /* 2*abs(p-pu) */
- movq mm4, mm0 /* 32+QValues */
- psubusb mm6, mm3 /* zero clamped TmpMod */
- movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
- paddb mm4, eight64c /* 32+QValues + 64 */
- pxor mm4, mm5 /* convert to a sign number */
- pxor mm3, mm5 /* convert to a sign number */
- pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
- pand mm3, mm2 /* use sharpen */
- paddsb mm6, mm1 /* clamping to high */
- psubsb mm6, mm1 /* offset back */
- por mm6, mm3 /* Mod value to be stored */
- movq2dq xmm6, mm6
- punpcklbw xmm7, xmm6 /* 03 xx 02 xx 01 xx 00 xx */
- psraw xmm7, 8 /* sign extended */
- movdqa [edi], xmm7 /* writeout UDmod */
- mov esi, Src
- mov edi, Des
- 
- mov eax, UDPointer
- mov ebx, LRPointer
- mov ebp, 8 /* row counter; locals are not referenced again once ebp is overwritten */
- FilterLoop1:
- movq xmm0, QWORD PTR [esi+edx] /* xmm0 = Pixels above */
- pxor xmm7, xmm7 /* clear xmm7 */
- movdqa xmm4, [eax] /* au */
- punpcklbw xmm0, xmm7 /* extended to shorts */
- 
- movq xmm2, QWORD PTR [esi+ecx] /* xmm2 = pixels below */
- pmullw xmm0, xmm4 /* pu*au */
- 
- movdqa xmm6, [eax+16] /* ad */
- punpcklbw xmm2, xmm7 /* extended to shorts*/
- 
- movq xmm1, QWORD PTR [esi-1] /* pixel to the left */
- pmullw xmm2, xmm6 /* ad*pd */
- 
- movdqa xmm3, [ebx] /* al */
- punpcklbw xmm1, xmm7 /* extended to shorts */
- movq xmm5, QWORD PTR [esi+1] /* pixel to the right */
- pmullw xmm1, xmm3 /* al * pl */
- paddw xmm4, xmm6 /* au+ad */
- punpcklbw xmm5, xmm7 /* extends to shorts */
- 
- movdqa xmm6, [ebx+128] /* ar */
- pmullw xmm5, xmm6 /* ar * pr */
- 
- paddw xmm0, xmm2 /* au*pu + ad*pd */
- paddw xmm4, xmm3 /* au+ad+al */
- paddw xmm0, xmm1 /* au*pu+ad*pd+al*pl */
- paddw xmm4, xmm6 /* au+ad+al+ar */
- movq xmm2, QWORD PTR [esi] /* p */
- paddw xmm0, xmm5 /* au*pu+ad*pd+al*pl+ar*pr */
- 
- /* xmm0 --- au*pu+ad*pd+al*pl+ar*pr */
- /* xmm4 --- au + ad + al + ar */
- 
- movdqa xmm1, eight128s /* 0080 0080 0080 0080 0080 0080 0080 0080 */
- punpcklbw xmm2, xmm7 /* extended to shorts */
- psubw xmm1, xmm4 /* 128-(au+ad+al+ar) */
- pmullw xmm2, xmm1 /* p*(128-(au+ad+al+ar)) */
- 
- add esi, ecx /* Src += Pitch */
- movdqa xmm6, eight64s /* 64, 64, 64, 64, 64, 64, 64, 64 */
- movdqa xmm7, xmm6 /* 64, 64, 64, 64, 64, 64, 64, 64 */
- add eax, 16 /* UDPointer += 8 */
- psllw xmm7, 8 /* {16384, .. } */
- paddw xmm0, xmm2 /* sum */
- add edi, ecx /* Des += Pitch */
- paddw xmm0, xmm6 /* sum+B */
- add ebx, 16 /* LPointer +=8 */
- paddw xmm0, xmm7 /* clamping: bias by 16384 ... */
- psubusw xmm0, xmm7 /* ... then unsigned-saturating subtract */
- dec ebp
- psrlw xmm0, 7 /* (sum+B)>>7 */
- packuswb xmm0, xmm7 /* pack to 8 bytes */
- movq QWORD PTR [edi+edx], xmm0 /* write to destination (edi was already advanced, edx = -Pitch) */
- jnz FilterLoop1
- 
- pop ebp
- pop ebx
- pop eax
- pop edx
- pop ecx
- pop edi
- pop esi
- }
- #endif
- }
|