12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856 |
- /****************************************************************************
- *
- * Module Title : OptFunctions.c
- *
- * Description : MMX or otherwise processor specific
- * optimised versions of functions
- *
- * AUTHOR : Paul Wilkins
- *
- *****************************************************************************
- * Revision History
- *
- * 1.07 JBB 26/01/01 Removed unused function
- * 1.06 YWX 23/05/00 Remove the clamping in MmxReconPostProcess()
- * 1.05 YWX 15/05/00 Added MmxReconPostProcess()
- * 1.04 SJL 03/14/00 Added in Tim's versions of MmxReconInter and MmxReconInterHalfPixel2.
- * 1.03 PGW 12/10/99 Changes to reduce unnecessary dependencies.
- * 1.02 PGW 30/08/99 Minor changes to MmxReconInterHalfPixel2().
- * 1.01 PGW 13/07/99 Changes to keep reconstruction data to 16 bit
- * 1.00 PGW 14/06/99 Configuration baseline
- *
- *****************************************************************************
- */
- /*
- Use Tim's optimized version.
- */
- #define USING_TIMS 1
- /****************************************************************************
- * Header Files
- *****************************************************************************
- */
- #define STRICT // Strict type checking.
- #include "codec_common.h"
- #include "reconstruct.h"
- /****************************************************************************
- * Module constants.
- *****************************************************************************
- */
- /****************************************************************************
- * Imports.
- *****************************************************************************
- */
- extern INT32 * XX_LUT;
- /****************************************************************************
- * Exported Global Variables
- *****************************************************************************
- */
- /****************************************************************************
- * Exported Functions
- *****************************************************************************
- */
- /****************************************************************************
- * Module Statics
- *****************************************************************************
- */
- INT16 Ones[4] = {1,1,1,1}; // Four INT16 ones. Not referenced by the routines visible in this part of the file.
- INT16 OneTwoEight[4] = {128,128,128,128}; // Four INT16 128s. Not referenced by the routines visible in this part of the file.
- UINT8 Eight128s[8] = {128,128,128,128,128,128,128,128}; // 0x80 in every byte. MMXReconIntra loads this into mm0 and XORs it into the packed bytes.
- #pragma warning( disable : 4799 ) // Disable no emms instruction warning! None of the MMX routines visible here executes emms.
- /****************************************************************************
- * Forward References
- *****************************************************************************
- */
- /****************************************************************************
- *
- * ROUTINE : MMXReconIntra
- *
- * INPUTS : INT16 * TmpDataBuffer
- * Not used by this routine.
- *
- * UINT16 * idct
- * Pointer to the output from the idct for this block.
- * 128 bytes are read (8 rows of 8 16-bit values). The
- * saturating pack treats the values as signed.
- *
- * UINT32 stride
- * Byte step between successive output rows in dest.
- *
- * OUTPUTS : UINT8 * dest
- * The reconstruction buffer. 8 bytes are written per row,
- * for 8 rows.
- *
- * RETURNS : None
- *
- * FUNCTION : Reconstructs an intra block - MMX version.
- * Each value is saturated to a signed byte and then has
- * its top bit flipped, which is the same as adding 128.
- *
- * SPECIAL NOTES : Tim Murphy's optimized version. The instruction order in
- * the loop is the author's hand schedule and is kept as is.
- * No emms is executed here.
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
- {
-     (void) TmpDataBuffer;
-     __asm
-     {
-         // Register roles:
-         //   eax = current input row      ecx = end of the input data
-         //   edx = current output row     ebx = output row step
-         //   mm0 = eight 0x80 bytes       mm2 = row being converted
-         mov         eax,[idct]          ; start of the 16 bit input values
-         mov         edx,[dest]          ; first output row
-         movq        mm0,[Eight128s]     ; mm0 = 0x8080808080808080
-         mov         ebx,[stride]        ; output row step
-         lea         ecx,[eax+128]       ; end of input: 8 rows of 16 bytes
-     next_row:
-         movq        mm2,[eax]           ; first four input values of the row
-         packsswb    mm2,[eax+8]         ; saturate all eight values to signed bytes
-         por         mm0,mm0             ; leaves mm0 unchanged (the author marked this as a stall)
-         pxor        mm2,mm0             ; flip the top bits: signed to unsigned (same as add 128)
-         lea         eax,[eax+16]        ; next input row
-         cmp         eax,ecx             ; still below the end of the input?
-         movq        [edx],mm2           ; store 8 output bytes (flags are not touched)
-         lea         edx,[edx+ebx]       ; next output row (flags are not touched)
-         jb          next_row            ; unsigned below, same opcode as jc: more rows to do
-     }
-     // Timing note from the original author: 6c/8 elts = 9c/8 = 1.125 c/pix
- }
- /****************************************************************************
- *
- * ROUTINE : MmxReconInter
- *
- * INPUTS : INT16 * TmpDataBuffer
- * Not used by this routine.
- *
- * UINT8 * RefPtr
- * The last frame reference (8 bytes read per row, 8 rows).
- *
- * INT16 * ChangePtr
- * Pointer to the change data (8 rows of 8 values, rows
- * 16 bytes apart).
- *
- * UINT32 LineStep
- * Byte step between rows in the recon and ref images.
- *
- * OUTPUTS : UINT8 * ReconPtr
- * The reconstruction (8 bytes written per row, 8 rows).
- *
- * RETURNS : None
- *
- * FUNCTION : Reconstructs data from last data and change.
- * Each output is ref + change, saturated to 0..255.
- *
- * SPECIAL NOTES : Optimised variant, built when USING_TIMS is non-zero.
- * The instruction order in the loop is the author's hand
- * schedule and is kept as is. No emms is executed here.
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- #if USING_TIMS
- void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
- {
-     (void) TmpDataBuffer;
-     __asm
-     {
-         // Register roles:
-         //   ebx = reference row     ecx = change row     edi = end of change data
-         //   eax = output row        edx = row step       mm0 = zero for unpacking
-         push        edi
-         mov         ebx, [RefPtr]
-         mov         ecx, [ChangePtr]
-         mov         eax, [ReconPtr]
-         mov         edx, [LineStep]
-         pxor        mm0, mm0            ; mm0 = 0
-         lea         edi, [ecx + 128]    ; end of changes: 8 rows of 16 bytes
-     next_row:
-         movq        mm2, [ebx]          ; 8 reference pixels (author note: +3 misaligned)
-         movq        mm4, [ecx]          ; first 4 changes
-         movq        mm3, mm2            ; second copy for the high half
-         movq        mm5, [ecx + 8]      ; last 4 changes
-         punpcklbw   mm2, mm0            ; first 4 refs widened to 16 bit
-         paddsw      mm2, mm4            ; add first 4 changes with signed saturation
-         punpckhbw   mm3, mm0            ; last 4 refs widened to 16 bit
-         paddsw      mm3, mm5            ; add last 4 changes with signed saturation
-         add         ebx, edx            ; next row of reference pixels
-         packuswb    mm2, mm3            ; saturate to unsigned bytes
-         lea         ecx, [ecx + 16]     ; next row of changes
-         cmp         ecx, edi            ; still below the end of the changes?
-         movq        [eax], mm2          ; store 8 result bytes (flags are not touched)
-         lea         eax, [eax+edx]      ; next row of output (flags are not touched)
-         jb          next_row            ; unsigned below, same opcode as jc
-         // Timing note from the original author: 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
-         pop         edi
-     }
- }
- #else
- void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
- {
-     // Fallback variant, built when USING_TIMS is zero.
-     // For each of 8 rows: widen 8 reference bytes to 16 bit, add the 8 change
-     // values with signed saturation, pack to unsigned bytes and store.
-     // Note that the line step for the change data is 16 bytes per row.
-     // TmpDataBuffer is not used. No emms is executed here.
-     __asm
-     {
-         // Register roles:
-         //   eax = output row     ebx = reference row     ecx = change row
-         //   edx = row step       edi = end of change data     mm6 = zero
-         push        edi
-         mov         eax, [ReconPtr]
-         mov         ebx, [RefPtr]
-         mov         ecx, [ChangePtr]
-         mov         edx, [LineStep]
-         pxor        mm6, mm6            ; mm6 = 0 for the byte to word unpack
-         lea         edi, [ecx+128]      ; end of changes: 8 rows of 16 bytes
-     recon_row:
-         movq        mm0, [ebx]          ; 8 reference bytes
-         movq        mm1, mm0            ; second copy for the high half
-         punpcklbw   mm0, mm6            ; low 4 bytes to words
-         punpckhbw   mm1, mm6            ; high 4 bytes to words
-         movq        mm2, [ecx]          ; first 4 change values
-         movq        mm4, [ecx+8]        ; next 4 change values
-         paddsw      mm0, mm2            ; ref + change, first 4
-         paddsw      mm1, mm4            ; ref + change, last 4
-         packuswb    mm0, mm1            ; saturate to unsigned bytes
-         movq        [eax], mm0          ; write the row to the reconstruction
-         add         ebx, edx            ; step the reference pointer
-         add         eax, edx            ; step the reconstruction pointer
-         add         ecx, 16             ; step the change pointer
-         cmp         ecx, edi            ; all 8 rows done?
-         jb          recon_row
-         pop         edi
-         //emms ; Clear the MMX state.
-     }
- }
- #endif
- /****************************************************************************
- *
- * ROUTINE : MmxReconInterHalfPixel2
- *
- * INPUTS : INT16 * TmpDataBuffer
- * Ignored when USING_TIMS is non-zero. The fallback variant
- * uses it as a 64 byte scratch area for the block.
- *
- * UINT8 * RefPtr1, RefPtr2
- * The two reference positions that are averaged
- * (8 bytes read per row from each, 8 rows).
- *
- * INT16 * ChangePtr
- * Pointer to the change data (8 rows of 8 values, rows
- * 16 bytes apart).
- *
- * UINT32 LineStep
- * Byte step between rows in the recon and ref images.
- *
- * OUTPUTS : UINT8 * ReconPtr
- * The reconstruction (8 bytes written per row, 8 rows).
- *
- * RETURNS : None
- *
- * FUNCTION : Reconstructs data from half pixel reference data and change.
- * Half pixel data interpolated from 2 references.
- * Each output is ((ref1 + ref2) >> 1) + change, saturated
- * to 0..255.
- *
- * SPECIAL NOTES : The change is added with paddsw (signed saturation), as
- * MmxReconInter does. With a wrapping paddw a large
- * positive change could wrap negative and be packed to 0
- * instead of 255.
- * No emms is executed here.
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- #if USING_TIMS
- #define A 0 // Set to 1 to add 1 to each sum before the shift (rounded average).
- void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
-                               UINT8 * RefPtr1, UINT8 * RefPtr2,
-                               INT16 * ChangePtr, UINT32 LineStep )
- {
- # if A
-     // 65537 is 0x00010001. NOTE(review): culong is not declared in this file;
-     // presumably a 32 bit unsigned type from a project header - confirm before enabling.
-     static culong FourOnes[2] = { 65537, 65537}; // only read once
- # endif
-     (void) TmpDataBuffer;
-     __asm
-     {
-         // Register roles:
-         //   esi = ref1 row     edi = ref2 row     ecx = change row
-         //   ebx = output row   edx = row step     eax = end of change data
-         //   mm0 = zero for unpacking
-         push        esi
-         push        edi
-         mov         ecx, [ChangePtr]
-         mov         esi, [RefPtr1]
-         mov         edi, [RefPtr2]
-         mov         ebx, [ReconPtr]
-         mov         edx, [LineStep]
-         lea         eax, [ecx+128]      ; end of changes: 8 rows of 16 bytes
- # if A
-         movq        mm1, [FourOnes]
- # endif
-         pxor        mm0, mm0            ; mm0 = 0
-     next_row:
-         movq        mm2, [esi]          ; row from ref1 (author note: +3 misaligned)
-         movq        mm4, [edi]          ; row from ref2 (author note: +3 misaligned)
-         movq        mm3, mm2
-         punpcklbw   mm2, mm0            ; mm2 = start of ref1 as 16 bit values
-         movq        mm5, mm4
-         movq        mm6, [ecx]          ; mm6 = first 4 changes
-         punpckhbw   mm3, mm0            ; mm3 = end of ref1 as 16 bit values
-         movq        mm7, [ecx+8]        ; mm7 = last 4 changes
-         punpcklbw   mm4, mm0            ; mm4 = start of ref2 as 16 bit values
-         punpckhbw   mm5, mm0            ; mm5 = end of ref2 as 16 bit values
-         paddw       mm2, mm4            ; mm2 = start (ref1 + ref2), at most 510
-         paddw       mm3, mm5            ; mm3 = end (ref1 + ref2), at most 510
- # if A
-         paddw       mm2, mm1            ; rounding adjustment
-         paddw       mm3, mm1
- # endif
-         psrlw       mm2, 1              ; mm2 = start (ref1 + ref2)/2
-         psrlw       mm3, 1              ; mm3 = end (ref1 + ref2)/2
-         paddsw      mm2, mm6            ; add changes to start, signed saturation
-         paddsw      mm3, mm7            ; add changes to end, signed saturation
-         lea         ecx, [ecx+16]       ; next row of changes
-         packuswb    mm2, mm3            ; pack start and end to unsigned bytes
-         add         esi, edx            ; next row ref1
-         add         edi, edx            ; next row ref2
-         cmp         ecx, eax            ; still below the end of the changes?
-         movq        [ebx], mm2          ; store result (flags are not touched)
-         lea         ebx, [ebx+edx]      ; next row of output (flags are not touched)
-         jb          next_row            ; unsigned below, same opcode as jc
-         // Timing note from the original author: 22c / 8 elts = 33c / 8 pixels = 4.125 c/pix
-         pop         edi
-         pop         esi
-     }
- }
- #undef A
- #else
- void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
-                               UINT8 * RefPtr1, UINT8 * RefPtr2,
-                               INT16 * ChangePtr, UINT32 LineStep )
- {
-     // Fallback variant: the whole block is built in a scratch buffer and
-     // only then copied to ReconPtr.
-     // TmpDataBuffer is a plain INT16 pointer, so it is used directly as the
-     // scratch area. Assumes it has at least 64 writable bytes - TODO confirm
-     // against the callers.
-     UINT8 * TmpDataPtr = (UINT8 *)TmpDataBuffer;
-     // Note that the line step for the change data is 16 bytes per row.
-     __asm
-     {
-         // Register roles while averaging:
-         //   eax = ref1 row     ebx = ref2 row     ecx = change row
-         //   edx = row step     esi = scratch row  edi = end of change data
-         //   mm6 = zero for unpacking
-         push        esi
-         push        edi
-         pxor        mm6, mm6            ; mm6 = 0
-         mov         eax, [RefPtr1]
-         mov         ebx, [RefPtr2]
-         mov         edx, [LineStep]
-         mov         ecx, [ChangePtr]
-         mov         esi, [TmpDataPtr]
-         lea         edi, [ecx+128]      ; end of changes: 8 rows of 16 bytes
-     average_row:
-         movq        mm0, [eax]          ; 8 bytes from ref1
-         movq        mm2, [ebx]          ; 8 bytes from ref2
-         movq        mm1, mm0
-         movq        mm3, mm2
-         punpcklbw   mm0, mm6            ; low bytes to words
-         punpcklbw   mm2, mm6
-         punpckhbw   mm1, mm6            ; high bytes to words
-         punpckhbw   mm3, mm6
-         paddw       mm0, mm2            ; ref1 + ref2, at most 510
-         paddw       mm1, mm3
-         psrlw       mm0, 1              ; truncating average
-         psrlw       mm1, 1
-         paddsw      mm0, [ecx]          ; add first 4 changes, signed saturation
-         paddsw      mm1, [ecx+8]        ; add last 4 changes, signed saturation
-         packuswb    mm0, mm1            ; saturate to unsigned bytes
-         movq        [esi], mm0          ; row into the scratch buffer
-         add         eax, edx            ; step the reference pointers
-         add         ebx, edx
-         add         esi, 8              ; scratch rows are 8 bytes apart
-         add         ecx, 16             ; step the change pointer
-         cmp         ecx, edi            ; all 8 rows done?
-         jb          average_row
-         // Now copy the results back to the reconstruction buffer.
-         //   eax = output row     ecx = scratch row     edi = end of scratch data
-         mov         eax, [ReconPtr]
-         mov         ecx, [TmpDataPtr]
-         lea         edi, [ecx+64]       ; 8 rows of 8 bytes
-     copy_row:
-         movq        mm0, [ecx]          ; 8 result bytes
-         movq        [eax], mm0          ; write them to the reconstruction buffer
-         add         ecx, 8
-         add         eax, edx            ; step the reconstruction pointer
-         cmp         ecx, edi            ; all 8 rows copied?
-         jb          copy_row
-         pop         edi
-         pop         esi
-         //emms
-     }
- }
- #endif
|