/**************************************************************************** * * Module Title : DeRingingWmtOpt.c * * Description : Optimized functions for PostProcessor * ***************************************************************************/ #define STRICT /* Strict type checking */ /**************************************************************************** * Header Files ****************************************************************************/ #include "postp.h" /**************************************************************************** * MAcros ****************************************************************************/ #pragma warning(disable:4305) #pragma warning(disable:4731) /**************************************************************************** * Module Statics ****************************************************************************/ #if defined(_WIN32_WCE) #else __declspec(align(16)) static unsigned short eight128s []= { 128, 128, 128, 128, 128, 128, 128, 128}; __declspec(align(16)) static unsigned short eight64s[] = { 64, 64, 64, 64, 64, 64, 64, 64}; __declspec(align(16)) static char eight64c [] = { 64, 64, 64,64,64,64,64,64}; __declspec(align(16)) static char eight32c [] = { 32,32,32,32,32,32,32,32}; __declspec(align(16)) static char eight127c []= { 127, 127, 127, 127, 127, 127, 127, 127}; __declspec(align(16)) static char eight128c []= { 128, 128, 128, 128, 128, 128, 128, 128}; __declspec(align(16)) static unsigned char eight223c[] = { 223,223,223,223,223,223,223,223}; __declspec(align(16)) static unsigned char eight231c[] = { 231,231,231,231,231,231,231,231}; #endif /**************************************************************************** * Imports ****************************************************************************/ extern UINT32 SharpenModifier[]; /**************************************************************************** * * ROUTINE : DeRingBlockStrong_WMT * * INPUTS : const POSTPROC_INSTANCE *pbi : Pointer to post-processor instance. * const UINT8 *SrcPtr : Pointer to input image. * UINT8 *DstPtr : Pointer to output image. * const INT32 Pitch : Image stride. * UINT32 FragQIndex : Q-index block encoded with. * UINT32 *QuantScale : Array of quantization scale factors. * * OUTPUTS : None. * * RETURNS : void * * FUNCTION : Filtering a block for de-ringing purpose. * * SPECIAL NOTES : None. * ****************************************************************************/ void DeringBlockStrong_WMT ( const POSTPROC_INSTANCE *pbi, const UINT8 *SrcPtr, UINT8 *DstPtr, const INT32 Pitch, UINT32 FragQIndex, UINT32 *QuantScale ) { #if defined(_WIN32_WCE) return; #else __declspec(align(16)) short UDMod[72]; __declspec(align(16)) short LRMod[128]; unsigned int PlaneLineStep = Pitch; const unsigned char *Src = SrcPtr; unsigned char *Des = DstPtr; short *UDPointer = UDMod; short *LRPointer = LRMod; UINT32 QStep = QuantScale[FragQIndex]; INT32 Sharpen = SharpenModifier[FragQIndex]; (void) pbi; __asm { push esi push edi mov esi, Src /* Source Pointer */ mov edi, UDPointer /* UD modifier pointer */ push ecx push edx mov ecx, PlaneLineStep /* Pitch Step */ xor edx, edx push eax push ebx mov eax, QStep /* QValue */ mov ebx, Sharpen /* Sharpen */ movd mm0, eax /* QValue */ movd mm2, ebx /* sharpen */ push ebp punpcklbw mm0, mm0 /* 00 00 00 QQ */ sub edx, ecx /* Negative Pitch */ punpcklbw mm2, mm2 /* 00 00 00 SS */ pxor mm7, mm7 /* clear mm7 for unpacks */ punpcklbw mm0, mm0 /* 00 00 qq qq */ mov eax, LRPointer /* Left and Right Modifier */ punpcklbw mm2, mm2 /* 00 00 ss ss */ lea ebx, [esi+ecx*8] /* Source Pointer of last row */ punpcklbw mm0, mm0 /* qq qq qq qq */ movq mm1, mm0; /* make a copy */ punpcklbw mm2, mm2 /* ss ss ss ss */ paddb mm1, mm0 /* QValue * 2 */ paddb mm1, mm0 /* High = 3 * Qvalue */ paddusb mm1, eight223c /* clamping high to 32 */ paddb mm0, eight32c /* 32+QValues */ psubusb mm1, eight223c /* Get the real value back */ movq mm3, eight127c /* 7f 7f 7f 7f 7f 7f 7f 7f */ pandn mm1, mm3 /* ClampHigh */ /* mm0,mm1,mm2,mm7 are in use */ /* mm0---> QValue+32 */ /* mm1---> ClampHigh */ /* mm2---> Sharpen */ /* mm7---> Cleared for unpack */ FillModLoop1: movq mm3, QWORD PTR [esi] /* read 8 pixels p */ pxor xmm7, xmm7 /* clear xmm7 */ movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */ movq mm5, mm3 /* make a copy of p */ psubusb mm3, mm4 /* p-pu */ psubusb mm4, mm5 /* pu-p */ por mm3, mm4 /* abs(p-pu) */ movq mm6, mm0 /* 32+QValues */ movq mm4, mm0 /* 32+QValues */ psubusb mm6, mm3 /* zero clampled TmpMod */ movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */ paddb mm4, eight64c /* 32+QValues + 64 */ pxor mm4, mm5 /* convert to a sign number */ pxor mm3, mm5 /* convert to a sign number */ pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */ pand mm3, mm2 /* use sharpen */ paddsb mm6, mm1 /* clamping to high */ psubsb mm6, mm1 /* offset back */ por mm6, mm3 /* Mod value to be stored */ movq mm3, QWORD PTR [esi] /* read 8 pixels p */ movq2dq xmm0, mm6 movq mm4, QWORD PTR [esi-1] /* Pixels on top pu */ punpcklbw xmm7, xmm0 /* extended to words */ movq mm5, mm3 /* make a copy of p */ psraw xmm7, 8 /* sign extended */ psubusb mm3, mm4 /* p-pu */ movdqa [edi], xmm7 /* writeout UDmod*/ psubusb mm4, mm5 /* pu-p */ por mm3, mm4 /* abs(p-pu) */ movq mm6, mm0 /* 32+QValues */ movq mm4, mm0 /* 32+QValues */ psubusb mm6, mm3 /* zero clampled TmpMod */ movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */ paddb mm4, eight64c /* 32+QValues + 64 */ pxor mm4, mm5 /* convert to a sign number */ pxor mm3, mm5 /* convert to a sign number */ pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */ pand mm3, mm2 /* use sharpen */ paddsb mm6, mm1 /* clamping to high */ psubsb mm6, mm1 /* offset back */ por mm6, mm3 /* Mod value to be stored */ movq mm3, QWORD PTR [esi] /* read 8 pixels p */ pxor xmm7, xmm7 /* clear xmm7 */ movq mm4, QWORD PTR [esi+1] /* Pixels on top pu */ movq2dq xmm0, mm6 movq mm5, mm3 /* make a copy of p */ punpcklbw xmm7, xmm0 /* extened to shorts */ psubusb mm3, mm4 /* p-pu */ psraw xmm7, 8 /* sign extended */ psubusb mm4, mm5 /* pu-p */ movdqa [eax], xmm7 /* writeout UDmod*/ por mm3, mm4 /* abs(p-pu) */ movq mm6, mm0 /* 32+QValues */ pxor xmm7, xmm7 /* clear xmm7 */ movq mm4, mm0 /* 32+QValues */ psubusb mm6, mm3 /* zero clampled TmpMod */ movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */ paddb mm4, eight64c /* 32+QValues + 64 */ pxor mm4, mm5 /* convert to a sign number */ pxor mm3, mm5 /* convert to a sign number */ pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */ pand mm3, mm2 /* use sharpen */ paddsb mm6, mm1 /* clamping to high */ psubsb mm6, mm1 /* offset back */ por mm6, mm3 /* Mod value to be stored */ add esi, ecx movq2dq xmm0, mm6 add edi, 16 punpcklbw xmm7, mm0 /* extended to shorts */ add eax, 16 psraw xmm7, 8 /* sign extended */ cmp esi, ebx movdqa [eax+112], xmm7 /* writeout UDmod*/ jne FillModLoop1 /* last UDMod */ movq mm3, QWORD PTR [esi] /* read 8 pixels p */ pxor xmm7, xmm7 /* clear xmm7 */ movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */ movq mm5, mm3 /* make a copy of p */ psubusb mm3, mm4 /* p-pu */ psubusb mm4, mm5 /* pu-p */ por mm3, mm4 /* abs(p-pu) */ movq mm6, mm0 /* 32+QValues */ movq mm4, mm0 /* 32+QValues */ psubusb mm6, mm3 /* zero clampled TmpMod */ movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */ paddb mm4, eight64c /* 32+QValues + 64 */ pxor mm4, mm5 /* convert to a sign number */ pxor mm3, mm5 /* convert to a sign number */ pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */ pand mm3, mm2 /* use sharpen */ paddsb mm6, mm1 /* clamping to high */ psubsb mm6, mm1 /* offset back */ por mm6, mm3 /* Mod value to be stored */ movq2dq xmm6, mm6 punpcklbw xmm7, xmm6 /* 03 xx 02 xx 01 xx 00 xx */ psraw xmm7, 8 /* sign extended */ movdqa [edi], xmm7 /* writeout UDmod */ mov esi, Src mov edi, Des mov eax, UDPointer mov ebx, LRPointer mov ebp, 8 FilterLoop1: movq xmm0, QWORD PTR [esi+edx] /* mm0 = Pixels above */ pxor xmm7, xmm7 /* clear mm7 */ movdqa xmm4, [eax] /* au */ punpcklbw xmm0, xmm7 /* extended to shorts */ movq xmm2, QWORD PTR [esi+ecx] /* mm2 = pixels below */ pmullw xmm0, xmm4 /* pu*au */ movdqa xmm6, [eax+16] /* ad */ punpcklbw xmm2, xmm7 /* extened to shorts*/ movq xmm1, QWORD PTR [esi-1] /* pixel to the left */ pmullw xmm2, xmm6 /* ad*pd */ movdqa xmm3, [ebx] /* al */ punpcklbw xmm1, xmm7 /* extended to shorts */ movq xmm5, QWORD PTR [esi+1] /* pixel to the right */ pmullw xmm1, xmm3 /* al * pl */ paddw xmm4, xmm6 /* au+ad */ punpcklbw xmm5, xmm7 /* extends to shorts */ movdqa xmm6, [ebx+128] /* ar */ pmullw xmm5, xmm6 /* ar * pr */ paddw xmm0, xmm2 /* au*pu + ad*pd */ paddw xmm4, xmm3 /* au+ad+al */ paddw xmm0, xmm1 /* au*pu+ad*pd+al*pl */ paddw xmm4, xmm6 /* au+ad+al+ar */ movq xmm2, QWORD PTR [esi] /* p */ paddw xmm0, xmm5 /* au*pu+ad*pd+al*pl+ar*pr */ /* xmm0 --- au*pu+ad*pd+al*pl+ar*pr */ /* xmm4 --- au + ad + al + ar */ movdqa xmm1, eight128s /* 0080 0080 0080 0080 0080 0080 0080 0080 */ punpcklbw xmm2, xmm7 /* extended to shorts */ psubw xmm1, xmm4 /* 128-(au+ad+al+ar) */ pmullw xmm2, xmm1 /* p*(128-(au+ad+al+ar)) */ add esi, ecx /* Src += Pitch */ movdqa xmm6, eight64s /* 64, 64, 64, 64, 64, 64, 64, 64 */ movdqa xmm7, xmm6 /* 64, 64, 64, 64, 64, 64, 64, 64 */ add eax, 16 /* UDPointer += 8 */ psllw xmm7, 8 /* {16384, .. } */ paddw xmm0, xmm2 /* sum */ add edi, ecx /* Des += Pitch */ paddw xmm0, xmm6 /* sum+B */ add ebx, 16 /* LPointer +=8 */ paddw xmm0, xmm7 /* clamping */ psubusw xmm0, xmm7 /* clamping */ dec ebp psrlw xmm0, 7 /* (sum+B)>>7 */ packuswb xmm0, xmm7 /* pack to 8 bytes */ movq QWORD PTR [edi+edx], xmm0 /* write to destination */ jnz FilterLoop1 pop ebp pop ebx pop eax pop edx pop ecx pop edi pop esi } #endif } /**************************************************************************** * * ROUTINE : DeRingBlockWeak_WMT * * INPUTS : const POSTPROC_INSTANCE *pbi : Pointer to post-processor instance. * const UINT8 *SrcPtr : Pointer to input image. * UINT8 *DstPtr : Pointer to output image. * const INT32 Pitch : Image stride. * UINT32 FragQIndex : Q-index block encoded with. * UINT32 *QuantScale : Array of quantization scale factors. * * OUTPUTS : None. * * RETURNS : void * * FUNCTION : Filtering a block for de-ringing purpose. * * SPECIAL NOTES : None. * ****************************************************************************/ void DeringBlockWeak_WMT ( const POSTPROC_INSTANCE *pbi, const UINT8 *SrcPtr, UINT8 *DstPtr, const INT32 Pitch, UINT32 FragQIndex, UINT32 *QuantScale ) { #if defined(_WIN32_WCE) return; #else __declspec(align(16)) short UDMod[72]; __declspec(align(16)) short LRMod[128]; unsigned int PlaneLineStep = Pitch; const unsigned char *Src = SrcPtr; unsigned char *Des = DstPtr; short *UDPointer = UDMod; short *LRPointer = LRMod; UINT32 QStep = QuantScale[FragQIndex]; INT32 Sharpen = SharpenModifier[FragQIndex]; (void) pbi; __asm { push esi push edi mov esi, Src /* Source Pointer */ mov edi, UDPointer /* UD modifier pointer */ push ecx push edx mov ecx, PlaneLineStep /* Pitch Step */ xor edx, edx push eax push ebx mov eax, QStep /* QValue */ mov ebx, Sharpen /* Sharpen */ movd mm0, eax /* QValue */ movd mm2, ebx /* sharpen */ push ebp punpcklbw mm0, mm0 /* 00 00 00 QQ */ sub edx, ecx /* Negative Pitch */ punpcklbw mm2, mm2 /* 00 00 00 SS */ pxor mm7, mm7 /* clear mm7 for unpacks */ punpcklbw mm0, mm0 /* 00 00 qq qq */ mov eax, LRPointer /* Left and Right Modifier */ punpcklbw mm2, mm2 /* 00 00 ss ss */ lea ebx, [esi+ecx*8] /* Source Pointer of last row */ punpcklbw mm0, mm0 /* qq qq qq qq */ movq mm1, mm0; /* make a copy */ punpcklbw mm2, mm2 /* ss ss ss ss */ paddb mm1, mm0 /* QValue * 2 */ paddb mm1, mm0 /* High = 3 * Qvalue */ paddusb mm1, eight231c /* clamping high to 24 */ paddb mm0, eight32c /* 32+QValues */ psubusb mm1, eight231c /* Get the real value back */ movq mm3, eight127c /* 7f 7f 7f 7f 7f 7f 7f 7f */ pandn mm1, mm3 /* ClampHigh */ /* mm0,mm1,mm2,mm7 are in use */ /* mm0---> QValue+32 */ /* mm1---> ClampHigh */ /* mm2---> Sharpen */ /* mm7---> Cleared for unpack */ FillModLoop1: movq mm3, QWORD PTR [esi] /* read 8 pixels p */ pxor xmm7, xmm7 /* clear xmm7 */ movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */ movq mm5, mm3 /* make a copy of p */ psubusb mm3, mm4 /* p-pu */ psubusb mm4, mm5 /* pu-p */ por mm3, mm4 /* abs(p-pu) */ movq mm6, mm0 /* 32+QValues */ paddusb mm3, mm3 /* 2*abs(p-pu) */ movq mm4, mm0 /* 32+QValues */ psubusb mm6, mm3 /* zero clampled TmpMod */ movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */ paddb mm4, eight64c /* 32+QValues + 64 */ pxor mm4, mm5 /* convert to a sign number */ pxor mm3, mm5 /* convert to a sign number */ pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */ pand mm3, mm2 /* use sharpen */ paddsb mm6, mm1 /* clamping to high */ psubsb mm6, mm1 /* offset back */ por mm6, mm3 /* Mod value to be stored */ movq mm3, QWORD PTR [esi] /* read 8 pixels p */ movq2dq xmm0, mm6 movq mm4, QWORD PTR [esi-1] /* Pixels on top pu */ punpcklbw xmm7, xmm0 /* extended to words */ movq mm5, mm3 /* make a copy of p */ psraw xmm7, 8 /* sign extended */ psubusb mm3, mm4 /* p-pu */ movdqa [edi], xmm7 /* writeout UDmod*/ psubusb mm4, mm5 /* pu-p */ por mm3, mm4 /* abs(p-pu) */ movq mm6, mm0 /* 32+QValues */ paddusb mm3, mm3 /* 2*abs(p-pu) */ movq mm4, mm0 /* 32+QValues */ psubusb mm6, mm3 /* zero clampled TmpMod */ movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */ paddb mm4, eight64c /* 32+QValues + 64 */ pxor mm4, mm5 /* convert to a sign number */ pxor mm3, mm5 /* convert to a sign number */ pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */ pand mm3, mm2 /* use sharpen */ paddsb mm6, mm1 /* clamping to high */ psubsb mm6, mm1 /* offset back */ por mm6, mm3 /* Mod value to be stored */ movq mm3, QWORD PTR [esi] /* read 8 pixels p */ pxor xmm7, xmm7 /* clear xmm7 */ movq mm4, QWORD PTR [esi+1] /* Pixels on top pu */ movq2dq xmm0, mm6 movq mm5, mm3 /* make a copy of p */ punpcklbw xmm7, xmm0 /* extened to shorts */ psubusb mm3, mm4 /* p-pu */ psraw xmm7, 8 /* sign extended */ psubusb mm4, mm5 /* pu-p */ movdqa [eax], xmm7 /* writeout UDmod*/ por mm3, mm4 /* abs(p-pu) */ movq mm6, mm0 /* 32+QValues */ paddusb mm3, mm3 /* 2*abs(p-pu) */ pxor xmm7, xmm7 /* clear xmm7 */ movq mm4, mm0 /* 32+QValues */ psubusb mm6, mm3 /* zero clampled TmpMod */ movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */ paddb mm4, eight64c /* 32+QValues + 64 */ pxor mm4, mm5 /* convert to a sign number */ pxor mm3, mm5 /* convert to a sign number */ pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */ pand mm3, mm2 /* use sharpen */ paddsb mm6, mm1 /* clamping to high */ psubsb mm6, mm1 /* offset back */ por mm6, mm3 /* Mod value to be stored */ add esi, ecx movq2dq xmm0, mm6 add edi, 16 punpcklbw xmm7, mm0 /* extended to shorts */ add eax, 16 psraw xmm7, 8 /* sign extended */ cmp esi, ebx movdqa [eax+112], xmm7 /* writeout UDmod*/ jne FillModLoop1 /* last UDMod */ movq mm3, QWORD PTR [esi] /* read 8 pixels p */ pxor xmm7, xmm7 /* clear xmm7 */ movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */ movq mm5, mm3 /* make a copy of p */ psubusb mm3, mm4 /* p-pu */ psubusb mm4, mm5 /* pu-p */ por mm3, mm4 /* abs(p-pu) */ movq mm6, mm0 /* 32+QValues */ paddusb mm3, mm3 /* 2*abs(p-pu) */ movq mm4, mm0 /* 32+QValues */ psubusb mm6, mm3 /* zero clampled TmpMod */ movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */ paddb mm4, eight64c /* 32+QValues + 64 */ pxor mm4, mm5 /* convert to a sign number */ pxor mm3, mm5 /* convert to a sign number */ pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */ pand mm3, mm2 /* use sharpen */ paddsb mm6, mm1 /* clamping to high */ psubsb mm6, mm1 /* offset back */ por mm6, mm3 /* Mod value to be stored */ movq2dq xmm6, mm6 punpcklbw xmm7, xmm6 /* 03 xx 02 xx 01 xx 00 xx */ psraw xmm7, 8 /* sign extended */ movdqa [edi], xmm7 /* writeout UDmod */ mov esi, Src mov edi, Des mov eax, UDPointer mov ebx, LRPointer mov ebp, 8 FilterLoop1: movq xmm0, QWORD PTR [esi+edx] /* mm0 = Pixels above */ pxor xmm7, xmm7 /* clear mm7 */ movdqa xmm4, [eax] /* au */ punpcklbw xmm0, xmm7 /* extended to shorts */ movq xmm2, QWORD PTR [esi+ecx] /* mm2 = pixels below */ pmullw xmm0, xmm4 /* pu*au */ movdqa xmm6, [eax+16] /* ad */ punpcklbw xmm2, xmm7 /* extened to shorts*/ movq xmm1, QWORD PTR [esi-1] /* pixel to the left */ pmullw xmm2, xmm6 /* ad*pd */ movdqa xmm3, [ebx] /* al */ punpcklbw xmm1, xmm7 /* extended to shorts */ movq xmm5, QWORD PTR [esi+1] /* pixel to the right */ pmullw xmm1, xmm3 /* al * pl */ paddw xmm4, xmm6 /* au+ad */ punpcklbw xmm5, xmm7 /* extends to shorts */ movdqa xmm6, [ebx+128] /* ar */ pmullw xmm5, xmm6 /* ar * pr */ paddw xmm0, xmm2 /* au*pu + ad*pd */ paddw xmm4, xmm3 /* au+ad+al */ paddw xmm0, xmm1 /* au*pu+ad*pd+al*pl */ paddw xmm4, xmm6 /* au+ad+al+ar */ movq xmm2, QWORD PTR [esi] /* p */ paddw xmm0, xmm5 /* au*pu+ad*pd+al*pl+ar*pr */ /* xmm0 --- au*pu+ad*pd+al*pl+ar*pr */ /* xmm4 --- au + ad + al + ar */ movdqa xmm1, eight128s /* 0080 0080 0080 0080 0080 0080 0080 0080 */ punpcklbw xmm2, xmm7 /* extended to shorts */ psubw xmm1, xmm4 /* 128-(au+ad+al+ar) */ pmullw xmm2, xmm1 /* p*(128-(au+ad+al+ar)) */ add esi, ecx /* Src += Pitch */ movdqa xmm6, eight64s /* 64, 64, 64, 64, 64, 64, 64, 64 */ movdqa xmm7, xmm6 /* 64, 64, 64, 64, 64, 64, 64, 64 */ add eax, 16 /* UDPointer += 8 */ psllw xmm7, 8 /* {16384, .. } */ paddw xmm0, xmm2 /* sum */ add edi, ecx /* Des += Pitch */ paddw xmm0, xmm6 /* sum+B */ add ebx, 16 /* LPointer +=8 */ paddw xmm0, xmm7 /* clamping */ psubusw xmm0, xmm7 /* clamping */ dec ebp psrlw xmm0, 7 /* (sum+B)>>7 */ packuswb xmm0, xmm7 /* pack to 8 bytes */ movq QWORD PTR [edi+edx], xmm0 /* write to destination */ jnz FilterLoop1 pop ebp pop ebx pop eax pop edx pop ecx pop edi pop esi } #endif }