/**************************************************************************** * * Module Title : newLoopTest_asm.c * * Description : Codec specific functions * * AUTHOR : Yaowu Xu * ***************************************************************************** * Revision History * * 1.02 YWX 03-Nov-00 Changed confusing variable name * 1.01 YWX 02-Nov-00 Added the set of functions * 1.00 YWX 19-Oct-00 configuration baseline ***************************************************************************** */ /**************************************************************************** * Header Frames ***************************************************************************** */ #define STRICT /* Strict type checking. */ #include "codec_common.h" #include /**************************************************************************** * Module constants. ***************************************************************************** */ #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #define FILTER_WEIGHT 128 #define FILTER_SHIFT 7 extern void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine); static __declspec(align(16)) short rd[]={64,64,64,64,64,64,64,64}; __declspec(align(16)) INT16 BilinearFilters_mmx[8][16] = { { 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0 }, { 112,112,112,112,112,112,112,112, 16, 16, 16, 16, 16, 16, 16, 16 }, { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, { 16, 16, 16, 16, 16, 16, 16, 16, 112,112,112,112,112,112,112,112 } }; __declspec(align(16)) INT16 BicubicFilters_mmx[17][8][32] = { { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -3, -3, -3, -3, -3, -3, -3, -3, 122,122,122,122,122,122,122,122, 9, 9, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, }, { -4, -4, -4, -4, -4, -4, -4, -4, 109,109,109,109,109,109,109,109, 24, 24, 24, 24, 24, 24, 24, 24, -1, -1, -1, -1, -1, -1, -1, -1, }, { -5, -5, -5, -5, -5, -5, -5, -5, 91, 91, 91, 91, 91, 91, 91, 91, 45, 45, 45, 45, 45, 45, 45, 45, -3, -3, -3, -3, -3, -3, -3, -3, }, { -4, -4, -4, -4, -4, -4, -4, -4, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, -4, -4, -4, -4, -4, -4, -4, -4, }, { -3, -3, -3, -3, -3, -3, -3, -3, 45, 45, 45, 45, 45, 45, 45, 45, 91, 91, 91, 91, 91, 91, 91, 91, -5, -5, -5, -5, -5, -5, -5, -5, }, { -1, -1, -1, -1, -1, -1, -1, -1, 24, 24, 24, 24, 24, 24, 24, 24, 109,109,109,109,109,109,109,109, -4, -4, -4, -4, -4, -4, -4, -4, }, { 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 122,122,122,122,122,122,122,122, -3, -3, -3, -3, -3, -3, -3, -3, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -4, -4, -4, -4, -4, -4, -4, -4, 124,124,124,124,124,124,124,124, 9, 9, 9, 9, 9, 9, 9, 9, -1, -1, -1, -1, -1, -1, -1, -1, }, { -5, -5, -5, -5, -5, -5, -5, -5, 110,110,110,110,110,110,110,110, 25, 25, 25, 25, 25, 25, 25, 25, -2, -2, -2, -2, -2, -2, -2, -2, }, { -6, -6, -6, -6, -6, -6, -6, -6, 91, 91, 91, 91, 91, 91, 91, 91, 46, 46, 46, 46, 46, 46, 46, 46, -3, -3, -3, -3, -3, -3, -3, -3, }, { -5, -5, -5, -5, -5, -5, -5, -5, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, -5, -5, -5, -5, -5, -5, -5, -5, }, { -3, -3, -3, -3, -3, -3, -3, -3, 46, 46, 
46, 46, 46, 46, 46, 46, 91, 91, 91, 91, 91, 91, 91, 91, -6, -6, -6, -6, -6, -6, -6, -6, }, { -2, -2, -2, -2, -2, -2, -2, -2, 25, 25, 25, 25, 25, 25, 25, 25, 110,110,110,110,110,110,110,110, -5, -5, -5, -5, -5, -5, -5, -5, }, { -1, -1, -1, -1, -1, -1, -1, -1, 9, 9, 9, 9, 9, 9, 9, 9, 124,124,124,124,124,124,124,124, -4, -4, -4, -4, -4, -4, -4, -4, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -4, -4, -4, -4, -4, -4, -4, -4, 123,123,123,123,123,123,123,123, 10, 10, 10, 10, 10, 10, 10, 10, -1, -1, -1, -1, -1, -1, -1, -1, }, { -6, -6, -6, -6, -6, -6, -6, -6, 110,110,110,110,110,110,110,110, 26, 26, 26, 26, 26, 26, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, }, { -7, -7, -7, -7, -7, -7, -7, -7, 92, 92, 92, 92, 92, 92, 92, 92, 47, 47, 47, 47, 47, 47, 47, 47, -4, -4, -4, -4, -4, -4, -4, -4, }, { -6, -6, -6, -6, -6, -6, -6, -6, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -6, -6, -6, -6, -6, -6, -6, -6, }, { -4, -4, -4, -4, -4, -4, -4, -4, 47, 47, 47, 47, 47, 47, 47, 47, 92, 92, 92, 92, 92, 92, 92, 92, -7, -7, -7, -7, -7, -7, -7, -7, }, { -2, -2, -2, -2, -2, -2, -2, -2, 26, 26, 26, 26, 26, 26, 26, 26, 110,110,110,110,110,110,110,110, -6, -6, -6, -6, -6, -6, -6, -6, }, { -1, -1, -1, -1, -1, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 123,123,123,123,123,123,123,123, -4, -4, -4, -4, -4, -4, -4, -4, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -5, -5, -5, -5, -5, -5, -5, -5, 124,124,124,124,124,124,124,124, 10, 10, 10, 10, 10, 10, 10, 10, -1, -1, -1, -1, -1, -1, -1, -1, }, { -7, -7, -7, -7, -7, -7, -7, -7, 110,110,110,110,110,110,110,110, 27, 27, 27, 27, 27, 27, 27, 27, -2, -2, -2, -2, -2, -2, -2, -2, }, { -7, -7, -7, -7, -7, -7, -7, -7, 91, 91, 91, 91, 91, 91, 91, 91, 48, 48, 48, 48, 48, 48, 48, 48, -4, -4, -4, -4, -4, -4, -4, -4, }, { -6, -6, -6, -6, -6, -6, -6, -6, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -6, -6, -6, -6, -6, -6, -6, -6, }, { -4, -4, -4, -4, -4, -4, -4, -4, 48, 48, 48, 48, 48, 48, 48, 48, 92, 92, 92, 92, 92, 92, 92, 92, -8, -8, -8, -8, -8, -8, -8, -8, }, { -2, -2, -2, -2, -2, -2, -2, -2, 27, 27, 27, 27, 27, 27, 27, 27, 110,110,110,110,110,110,110,110, -7, -7, -7, -7, -7, -7, -7, -7, }, { -1, -1, -1, -1, -1, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 124,124,124,124,124,124,124,124, -5, -5, -5, -5, -5, -5, -5, -5, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -6, -6, -6, -6, -6, -6, -6, -6, 124,124,124,124,124,124,124,124, 11, 11, 11, 11, 11, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1, }, { -8, -8, -8, -8, -8, -8, -8, -8, 111,111,111,111,111,111,111,111, 28, 28, 28, 28, 28, 28, 28, 28, -3, -3, -3, -3, -3, -3, -3, -3, }, { -8, -8, -8, -8, -8, -8, -8, -8, 92, 92, 92, 92, 92, 92, 92, 92, 49, 49, 49, 49, 49, 49, 49, 49, -5, -5, -5, -5, -5, -5, -5, -5, }, { -7, -7, -7, -7, -7, -7, -7, -7, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, -7, -7, -7, -7, -7, -7, -7, -7, }, { -5, -5, -5, -5, -5, -5, -5, -5, 49, 49, 49, 49, 49, 49, 49, 49, 92, 92, 92, 92, 92, 92, 92, 92, -8, -8, -8, -8, -8, -8, -8, -8, }, { -3, -3, -3, -3, -3, -3, -3, -3, 28, 28, 28, 28, 28, 28, 28, 28, 111,111,111,111,111,111,111,111, -8, -8, -8, -8, -8, -8, -8, -8, }, { -1, -1, -1, -1, -1, -1, -1, -1, 11, 11, 11, 11, 11, 11, 11, 11, 124,124,124,124,124,124,124,124, -6, -6, -6, -6, -6, -6, -6, -6, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 
128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -6, -6, -6, -6, -6, -6, -6, -6, 123,123,123,123,123,123,123,123, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, }, { -9, -9, -9, -9, -9, -9, -9, -9, 111,111,111,111,111,111,111,111, 29, 29, 29, 29, 29, 29, 29, 29, -3, -3, -3, -3, -3, -3, -3, -3, }, { -9, -9, -9, -9, -9, -9, -9, -9, 93, 93, 93, 93, 93, 93, 93, 93, 50, 50, 50, 50, 50, 50, 50, 50, -6, -6, -6, -6, -6, -6, -6, -6, }, { -8, -8, -8, -8, -8, -8, -8, -8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, -8, -8, -8, -8, -8, -8, -8, -8, }, { -6, -6, -6, -6, -6, -6, -6, -6, 50, 50, 50, 50, 50, 50, 50, 50, 93, 93, 93, 93, 93, 93, 93, 93, -9, -9, -9, -9, -9, -9, -9, -9, }, { -3, -3, -3, -3, -3, -3, -3, -3, 29, 29, 29, 29, 29, 29, 29, 29, 111,111,111,111,111,111,111,111, -9, -9, -9, -9, -9, -9, -9, -9, }, { -1, -1, -1, -1, -1, -1, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 123,123,123,123,123,123,123,123, -6, -6, -6, -6, -6, -6, -6, -6, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -7, -7, -7, -7, -7, -7, -7, -7, 124,124,124,124,124,124,124,124, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, }, { -10,-10,-10,-10,-10,-10,-10,-10, 111,111,111,111,111,111,111,111, 30, 30, 30, 30, 30, 30, 30, 30, -3, -3, -3, -3, -3, -3, -3, -3, }, { -10,-10,-10,-10,-10,-10,-10,-10, 93, 93, 93, 93, 93, 93, 93, 93, 51, 51, 51, 51, 51, 51, 51, 51, -6, -6, -6, -6, -6, -6, -6, -6, }, { -9, -9, -9, -9, -9, -9, -9, -9, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, -9, -9, -9, -9, -9, -9, -9, -9, }, { -6, -6, -6, -6, -6, -6, -6, -6, 51, 51, 51, 51, 51, 51, 51, 51, 93, 93, 93, 93, 93, 93, 93, 93, -10,-10,-10,-10,-10,-10,-10,-10, }, { -3, -3, -3, -3, -3, -3, -3, -3, 30, 30, 30, 30, 30, 30, 30, 30, 111,111,111,111,111,111,111,111, -10,-10,-10,-10,-10,-10,-10,-10, }, { -1, -1, -1, -1, -1, -1, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 124,124,124,124,124,124,124,124, -7, -7, -7, -7, -7, -7, -7, -7, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -7, -7, -7, -7, -7, -7, -7, -7, 123,123,123,123,123,123,123,123, 13, 13, 13, 13, 13, 13, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1, }, { -11,-11,-11,-11,-11,-11,-11,-11, 112,112,112,112,112,112,112,112, 31, 31, 31, 31, 31, 31, 31, 31, -4, -4, -4, -4, -4, -4, -4, -4, }, { -11,-11,-11,-11,-11,-11,-11,-11, 94, 94, 94, 94, 94, 94, 94, 94, 52, 52, 52, 52, 52, 52, 52, 52, -7, -7, -7, -7, -7, -7, -7, -7, }, { -10,-10,-10,-10,-10,-10,-10,-10, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -10,-10,-10,-10,-10,-10,-10,-10, }, { -7, -7, -7, -7, -7, -7, -7, -7, 52, 52, 52, 52, 52, 52, 52, 52, 94, 94, 94, 94, 94, 94, 94, 94, -11,-11,-11,-11,-11,-11,-11,-11, }, { -4, -4, -4, -4, -4, -4, -4, -4, 31, 31, 31, 31, 31, 31, 31, 31, 112,112,112,112,112,112,112,112, -11,-11,-11,-11,-11,-11,-11,-11, }, { -1, -1, -1, -1, -1, -1, -1, -1, 13, 13, 13, 13, 13, 13, 13, 13, 123,123,123,123,123,123,123,123, -7, -7, -7, -7, -7, -7, -7, -7, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -8, -8, -8, -8, -8, -8, -8, -8, 124,124,124,124,124,124,124,124, 13, 13, 13, 13, 13, 13, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1, }, { -12,-12,-12,-12,-12,-12,-12,-12, 112,112,112,112,112,112,112,112, 32, 32, 32, 32, 32, 32, 32, 32, -4, -4, -4, -4, -4, -4, -4, -4, }, { -12,-12,-12,-12,-12,-12,-12,-12, 94, 
94, 94, 94, 94, 94, 94, 94, 53, 53, 53, 53, 53, 53, 53, 53, -7, -7, -7, -7, -7, -7, -7, -7, }, { -10,-10,-10,-10,-10,-10,-10,-10, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -10,-10,-10,-10,-10,-10,-10,-10, }, { -7, -7, -7, -7, -7, -7, -7, -7, 53, 53, 53, 53, 53, 53, 53, 53, 94, 94, 94, 94, 94, 94, 94, 94, -12,-12,-12,-12,-12,-12,-12,-12, }, { -4, -4, -4, -4, -4, -4, -4, -4, 32, 32, 32, 32, 32, 32, 32, 32, 112,112,112,112,112,112,112,112, -12,-12,-12,-12,-12,-12,-12,-12, }, { -1, -1, -1, -1, -1, -1, -1, -1, 13, 13, 13, 13, 13, 13, 13, 13, 124,124,124,124,124,124,124,124, -8, -8, -8, -8, -8, -8, -8, -8, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -9, -9, -9, -9, -9, -9, -9, -9, 124,124,124,124,124,124,124,124, 14, 14, 14, 14, 14, 14, 14, 14, -1, -1, -1, -1, -1, -1, -1, -1, }, { -13,-13,-13,-13,-13,-13,-13,-13, 112,112,112,112,112,112,112,112, 33, 33, 33, 33, 33, 33, 33, 33, -4, -4, -4, -4, -4, -4, -4, -4, }, { -13,-13,-13,-13,-13,-13,-13,-13, 95, 95, 95, 95, 95, 95, 95, 95, 54, 54, 54, 54, 54, 54, 54, 54, -8, -8, -8, -8, -8, -8, -8, -8, }, { -11,-11,-11,-11,-11,-11,-11,-11, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, -11,-11,-11,-11,-11,-11,-11,-11, }, { -8, -8, -8, -8, -8, -8, -8, -8, 54, 54, 54, 54, 54, 54, 54, 54, 95, 95, 95, 95, 95, 95, 95, 95, -13,-13,-13,-13,-13,-13,-13,-13, }, { -4, -4, -4, -4, -4, -4, -4, -4, 33, 33, 33, 33, 33, 33, 33, 33, 112,112,112,112,112,112,112,112, -13,-13,-13,-13,-13,-13,-13,-13, }, { -1, -1, -1, -1, -1, -1, -1, -1, 14, 14, 14, 14, 14, 14, 14, 14, 124,124,124,124,124,124,124,124, -9, -9, -9, -9, -9, -9, -9, -9, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -9, -9, -9, -9, -9, -9, -9, -9, 123,123,123,123,123,123,123,123, 15, 15, 15, 15, 15, 15, 15, 15, -1, -1, -1, -1, -1, -1, -1, -1, }, { -14,-14,-14,-14,-14,-14,-14,-14, 113,113,113,113,113,113,113,113, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5, }, { -14,-14,-14,-14,-14,-14,-14,-14, 95, 95, 95, 95, 95, 95, 95, 95, 55, 55, 55, 55, 55, 55, 55, 55, -8, -8, -8, -8, -8, -8, -8, -8, }, { -12,-12,-12,-12,-12,-12,-12,-12, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, -12,-12,-12,-12,-12,-12,-12,-12, }, { -8, -8, -8, -8, -8, -8, -8, -8, 55, 55, 55, 55, 55, 55, 55, 55, 95, 95, 95, 95, 95, 95, 95, 95, -14,-14,-14,-14,-14,-14,-14,-14, }, { -5, -5, -5, -5, -5, -5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 112,112,112,112,112,112,112,112, -13,-13,-13,-13,-13,-13,-13,-13, }, { -1, -1, -1, -1, -1, -1, -1, -1, 15, 15, 15, 15, 15, 15, 15, 15, 123,123,123,123,123,123,123,123, -9, -9, -9, -9, -9, -9, -9, -9, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -10,-10,-10,-10,-10,-10,-10,-10, 124,124,124,124,124,124,124,124, 15, 15, 15, 15, 15, 15, 15, 15, -1, -1, -1, -1, -1, -1, -1, -1, }, { -14,-14,-14,-14,-14,-14,-14,-14, 113,113,113,113,113,113,113,113, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5, }, { -15,-15,-15,-15,-15,-15,-15,-15, 96, 96, 96, 96, 96, 96, 96, 96, 56, 56, 56, 56, 56, 56, 56, 56, -9, -9, -9, -9, -9, -9, -9, -9, }, { -13,-13,-13,-13,-13,-13,-13,-13, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, -13,-13,-13,-13,-13,-13,-13,-13, }, { -9, -9, -9, -9, -9, -9, -9, -9, 56, 56, 56, 56, 56, 56, 56, 56, 96, 96, 96, 96, 96, 96, 96, 96, -15,-15,-15,-15,-15,-15,-15,-15, }, { -5, -5, -5, -5, -5, 
-5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 113,113,113,113,113,113,113,113, -14,-14,-14,-14,-14,-14,-14,-14, }, { -1, -1, -1, -1, -1, -1, -1, -1, 15, 15, 15, 15, 15, 15, 15, 15, 124,124,124,124,124,124,124,124, -10,-10,-10,-10,-10,-10,-10,-10, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -10,-10,-10,-10,-10,-10,-10,-10, 123,123,123,123,123,123,123,123, 16, 16, 16, 16, 16, 16, 16, 16, -1, -1, -1, -1, -1, -1, -1, -1, }, { -15,-15,-15,-15,-15,-15,-15,-15, 113,113,113,113,113,113,113,113, 35, 35, 35, 35, 35, 35, 35, 35, -5, -5, -5, -5, -5, -5, -5, -5, }, { -16,-16,-16,-16,-16,-16,-16,-16, 98, 98, 98, 98, 98, 98, 98, 98, 56, 56, 56, 56, 56, 56, 56, 56, -10,-10,-10,-10,-10,-10,-10,-10, }, { -14,-14,-14,-14,-14,-14,-14,-14, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, -14,-14,-14,-14,-14,-14,-14,-14, }, { -10,-10,-10,-10,-10,-10,-10,-10, 56, 56, 56, 56, 56, 56, 56, 56, 98, 98, 98, 98, 98, 98, 98, 98, -16,-16,-16,-16,-16,-16,-16,-16, }, { -5, -5, -5, -5, -5, -5, -5, -5, 35, 35, 35, 35, 35, 35, 35, 35, 113,113,113,113,113,113,113,113, -15,-15,-15,-15,-15,-15,-15,-15, }, { -1, -1, -1, -1, -1, -1, -1, -1, 16, 16, 16, 16, 16, 16, 16, 16, 123,123,123,123,123,123,123,123, -10,-10,-10,-10,-10,-10,-10,-10, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -11,-11,-11,-11,-11,-11,-11,-11, 124,124,124,124,124,124,124,124, 17, 17, 17, 17, 17, 17, 17, 17, -2, -2, -2, -2, -2, -2, -2, -2, }, { -16,-16,-16,-16,-16,-16,-16,-16, 113,113,113,113,113,113,113,113, 36, 36, 36, 36, 36, 36, 36, 36, -5, -5, -5, -5, -5, -5, -5, -5, }, { -17,-17,-17,-17,-17,-17,-17,-17, 98, 98, 98, 98, 98, 98, 98, 98, 57, 57, 57, 57, 57, 57, 57, 57, -10,-10,-10,-10,-10,-10,-10,-10, }, { -14,-14,-14,-14,-14,-14,-14,-14, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, -14,-14,-14,-14,-14,-14,-14,-14, }, { -10,-10,-10,-10,-10,-10,-10,-10, 57, 57, 57, 57, 57, 57, 57, 57, 98, 98, 98, 98, 98, 98, 98, 98, -17,-17,-17,-17,-17,-17,-17,-17, }, { -5, -5, -5, -5, -5, -5, -5, -5, 36, 36, 36, 36, 36, 36, 36, 36, 113,113,113,113,113,113,113,113, -16,-16,-16,-16,-16,-16,-16,-16, }, { -2, -2, -2, -2, -2, -2, -2, -2, 17, 17, 17, 17, 17, 17, 17, 17, 124,124,124,124,124,124,124,124, -11,-11,-11,-11,-11,-11,-11,-11, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { -12,-12,-12,-12,-12,-12,-12,-12, 125,125,125,125,125,125,125,125, 17, 17, 17, 17, 17, 17, 17, 17, -2, -2, -2, -2, -2, -2, -2, -2, }, { -17,-17,-17,-17,-17,-17,-17,-17, 114,114,114,114,114,114,114,114, 37, 37, 37, 37, 37, 37, 37, 37, -6, -6, -6, -6, -6, -6, -6, -6, }, { -18,-18,-18,-18,-18,-18,-18,-18, 99, 99, 99, 99, 99, 99, 99, 99, 58, 58, 58, 58, 58, 58, 58, 58, -11,-11,-11,-11,-11,-11,-11,-11, }, { -15,-15,-15,-15,-15,-15,-15,-15, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, -15,-15,-15,-15,-15,-15,-15,-15, }, { -11,-11,-11,-11,-11,-11,-11,-11, 58, 58, 58, 58, 58, 58, 58, 58, 99, 99, 99, 99, 99, 99, 99, 99, -18,-18,-18,-18,-18,-18,-18,-18, }, { -6, -6, -6, -6, -6, -6, -6, -6, 37, 37, 37, 37, 37, 37, 37, 37, 114,114,114,114,114,114,114,114, -17,-17,-17,-17,-17,-17,-17,-17, }, { -2, -2, -2, -2, -2, -2, -2, -2, 17, 17, 17, 17, 17, 17, 17, 17, 125,125,125,125,125,125,125,125, -12,-12,-12,-12,-12,-12,-12,-12, }, }, { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 
-12,-12,-12,-12,-12,-12,-12,-12, 124,124,124,124,124,124,124,124, 18, 18, 18, 18, 18, 18, 18, 18, -2, -2, -2, -2, -2, -2, -2, -2, }, { -18,-18,-18,-18,-18,-18,-18,-18, 114,114,114,114,114,114,114,114, 38, 38, 38, 38, 38, 38, 38, 38, -6, -6, -6, -6, -6, -6, -6, -6, }, { -19,-19,-19,-19,-19,-19,-19,-19, 99, 99, 99, 99, 99, 99, 99, 99, 59, 59, 59, 59, 59, 59, 59, 59, -11,-11,-11,-11,-11,-11,-11,-11, }, { -16,-16,-16,-16,-16,-16,-16,-16, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, -16,-16,-16,-16,-16,-16,-16,-16, }, { -11,-11,-11,-11,-11,-11,-11,-11, 59, 59, 59, 59, 59, 59, 59, 59, 99, 99, 99, 99, 99, 99, 99, 99, -19,-19,-19,-19,-19,-19,-19,-19, }, { -6, -6, -6, -6, -6, -6, -6, -6, 38, 38, 38, 38, 38, 38, 38, 38, 114,114,114,114,114,114,114,114, -18,-18,-18,-18,-18,-18,-18,-18, }, { -2, -2, -2, -2, -2, -2, -2, -2, 18, 18, 18, 18, 18, 18, 18, 18, 124,124,124,124,124,124,124,124, -12,-12,-12,-12,-12,-12,-12,-12, }, }, // Dummy entry for VP61 supporty { { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -4, -4, -4, -4, -4, -4, -4, -4, 118,118,118,118,118,118,118,118, 16, 16, 16, 16, 16, 16, 16, 16, -2, -2, -2, -2, -2, -2, -2, -2 }, { -7, -7, -7, -7, -7, -7, -7, -7, 106,106,106,106,106,106,106,106, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5 }, { -8, -8, -8, -8, -8, -8, -8, -8, 90, 90, 90, 90, 90, 90, 90, 90, 53, 53, 53, 53, 53, 53, 53, 53, -7, -7, -7, -7, -7, -7, -7, -7 }, { -8, -8, -8, -8, -8, -8, -8, -8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, -8, -8, -8, -8, -8, -8, -8, -8 }, { -7, -7, -7, -7, -7, -7, -7, -7, 53, 53, 53, 53, 53, 53, 53, 53, 90, 90, 90, 90, 90, 90, 90, 90, -8, -8, -8, -8, -8, -8, -8, -8 }, { -5, -5, -5, -5, -5, -5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 106,106,106,106,106,106,106,106, -7, -7, -7, -7, -7, -7, -7, -7 }, { -2, -2, -2, -2, -2, -2, -2, -2, 16, 16, 16, 16, 16, 16, 16, 16, 118,118,118,118,118,118,118,118, -4, -4, -4, -4, -4, -4, -4, -4 } } }; void FilterBlock1d_h_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter ) { __asm { mov edi, Filter movq mm1, [edi] ; mm3 *= kernel 0 modifiers. movq mm2, [edi+ 16] ; mm3 *= kernel 0 modifiers. movq mm6, [edi + 32] ; mm3 *= kernel 0 modifiers. movq mm7, [edi + 48] ; mm3 *= kernel 0 modifiers. mov edi,OutputPtr mov esi,SrcPtr dec esi mov ecx, DWORD PTR OutputHeight mov eax, OutputWidth ; destination pitch? pxor mm0, mm0 ; mm0 = 00000000 nextrow: movq mm3, [esi] ; mm3 = p-1..p6 movq mm4, mm3 ; mm4 = p-1..p6 punpcklbw mm3, mm0 ; mm3 = p-1..p2 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. psrlq mm4, 24 ; mm4 = p2..p6 movq mm5, mm4 ; mm5 = p2..p6 punpcklbw mm5, mm0 ; mm5 = p2..p5 pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers paddsw mm3, mm5 ; mm3 += mm5 movq mm4, [esi+1] ; mm4 = p0..p6 movq mm5, mm4 ; mm5 = p0..p6 punpcklbw mm5, mm0 ; mm5 = p0..p3 pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers paddsw mm3, mm5 ; mm3 += mm5 psrlq mm4, 8 ; mm4 = p1..p6 movq mm5, mm4 ; mm5 = p1..p6 punpcklbw mm5, mm0 ; mm5 = p1..p4 pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers paddsw mm3, mm5 ; mm3 += mm5 paddsw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and unpack to saturate movd [edi],mm3 ; store the results in the destination movq mm3, [esi+4] ; mm3 = p-1..p6 movq mm4, mm3 ; mm4 = p-1..p6 punpcklbw mm3, mm0 ; mm3 = p-1..p2 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. 
psrlq mm4, 24 ; mm4 = p2..p6 movq mm5, mm4 ; mm5 = p2..p6 punpcklbw mm5, mm0 ; mm5 = p2..p5 pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers paddsw mm3, mm5 ; mm3 += mm5 movq mm4, [esi+5] ; mm4 = p0..p6 movq mm5, mm4 ; mm5 = p0..p6 punpcklbw mm5, mm0 ; mm5 = p0..p3 pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers paddsw mm3, mm5 ; mm3 += mm5 psrlq mm4, 8 ; mm4 = p1..p6 movq mm5, mm4 ; mm5 = p1..p6 punpcklbw mm5, mm0 ; mm5 = p1..p4 pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers paddsw mm3, mm5 ; mm3 += mm5 paddsw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and unpack to saturate movd [edi+4],mm3 ; store the results in the destination add esi,SrcPixelsPerLine ; next line add edi,eax; dec ecx ; decrement count jnz nextrow ; next row } } void FilterBlock1d_v_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter ) { __asm { mov edi, Filter movq mm1, [edi] ; mm3 *= kernel 0 modifiers. movq mm2, [edi + 16] ; mm3 *= kernel 0 modifiers. movq mm6, [edi + 32] ; mm3 *= kernel 0 modifiers. movq mm7, [edi + 48] ; mm3 *= kernel 0 modifiers. mov edx, PixelsPerLine mov edi, OutputPtr mov esi, SrcPtr sub esi, PixelsPerLine mov ecx, DWORD PTR OutputHeight mov eax, OutputWidth ; destination pitch? pxor mm0, mm0 ; mm0 = 00000000 nextrow: movq mm3, [esi] ; mm3 = p0..p8 punpcklbw mm3, mm0 ; mm3 = p0..p3 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. add esi, edx ; move source forward 1 line to avoid 3 * pitch movq mm4, [esi+2*edx] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers. paddsw mm3, mm4 ; mm3 += mm4 movq mm4, [esi ] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers. paddsw mm3, mm4 ; mm3 += mm4 movq mm4, [esi +edx] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers. paddsw mm3, mm4 ; mm3 += mm4 paddsw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and saturate movd [edi],mm3 ; store the results in the destination sub esi, edx ; subtract edx to get back to -1 column movq mm3, [esi+4] ; mm3 = p4..p12 punpcklbw mm3, mm0 ; mm3 = p4..p7 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. add esi, edx ; move source forward 1 line to avoid 3 * pitch movq mm4, [esi+2*edx+4] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers. paddsw mm3, mm4 ; mm3 += mm4 movq mm4, [esi +4] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers. paddsw mm3, mm4 ; mm3 += mm4 movq mm4, [esi +edx+4] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers. paddsw mm3, mm4 ; mm3 += mm4 paddsw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and saturate movd [edi+4],mm3 ; store the results in the destination // the subsequent iterations repeat 3 out of 4 of these reads. Since the // recon block should be in cache this shouldn't cost much. Its obviously // avoidable!!!. add edi,eax; dec ecx ; decrement count jnz nextrow ; next row } } void FilterBlock1d_h_mmxa( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter ) { __asm { mov edi, Filter movq mm1, [edi] ; mm3 *= kernel 0 modifiers. movq mm2, [edi+ 16] ; mm3 *= kernel 0 modifiers. movq mm6, [edi + 32] ; mm3 *= kernel 0 modifiers. 
movq mm7, [edi + 48] ; mm3 *= kernel 0 modifiers. mov edi,OutputPtr mov esi,SrcPtr dec esi mov ecx, DWORD PTR OutputHeight mov eax, OutputWidth ; destination pitch? pxor mm0, mm0 ; mm0 = 00000000 nextrow: movq mm3, [esi] ; mm3 = p-1..p6 movq mm4, mm3 ; mm4 = p-1..p6 punpcklbw mm3, mm0 ; mm3 = p-1..p2 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. psrlq mm4, 8 ; mm4 = p0..p6 movq mm5, mm4 ; mm5 = p0..p6 punpcklbw mm5, mm0 ; mm5 = p0..p3 pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers paddw mm3, mm5 ; mm3 += mm5 psrlq mm4, 8 ; mm4 = p1..p6 movq mm5, mm4 ; mm5 = p1..p6 punpcklbw mm5, mm0 ; mm5 = p1..p4 pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers paddw mm3, mm5 ; mm3 += mm5 psrlq mm4, 8 ; mm4 = p2..p6 movq mm5, mm4 ; mm5 = p2..p6 punpcklbw mm5, mm0 ; mm5 = p2..p5 pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers paddw mm3, mm5 ; mm3 += mm5 paddw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and unpack to saturate movd [edi],mm3 ; store the results in the destination movq mm3, [esi+4] ; mm3 = p-1..p6 movq mm4, mm3 ; mm4 = p-1..p6 punpcklbw mm3, mm0 ; mm3 = p-1..p2 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. psrlq mm4, 8 ; mm4 = p0..p6 movq mm5, mm4 ; mm5 = p0..p6 punpcklbw mm5, mm0 ; mm5 = p0..p3 pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers paddw mm3, mm5 ; mm3 += mm5 psrlq mm4, 8 ; mm4 = p1..p6 movq mm5, mm4 ; mm5 = p1..p6 punpcklbw mm5, mm0 ; mm5 = p1..p4 pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers paddw mm3, mm5 ; mm3 += mm5 psrlq mm4, 8 ; mm4 = p2..p6 movq mm5, mm4 ; mm5 = p2..p6 punpcklbw mm5, mm0 ; mm5 = p2..p5 pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers paddw mm3, mm5 ; mm3 += mm5 paddw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and unpack to saturate movd [edi+4],mm3 ; store the results in the destination add esi,SrcPixelsPerLine ; next line add edi,eax; dec ecx ; decrement count jnz nextrow ; next row } } void FilterBlock1d_v_mmxa( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter ) { __asm { mov edi, Filter movq mm1, [edi] ; mm3 *= kernel 0 modifiers. movq mm2, [edi + 16] ; mm3 *= kernel 0 modifiers. movq mm6, [edi + 32] ; mm3 *= kernel 0 modifiers. movq mm7, [edi + 48] ; mm3 *= kernel 0 modifiers. mov edx, PixelsPerLine mov edi, OutputPtr mov esi, SrcPtr sub esi, PixelsPerLine mov ecx, DWORD PTR OutputHeight mov eax, OutputWidth ; destination pitch? pxor mm0, mm0 ; mm0 = 00000000 nextrow: movq mm3, [esi] ; mm3 = p0..p8 punpcklbw mm3, mm0 ; mm3 = p0..p3 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. movq mm4, [esi +edx ] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers. paddw mm3, mm4 ; mm3 += mm4 movq mm4, [esi +2*edx] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers. paddw mm3, mm4 ; mm3 += mm4 add esi, edx ; move source forward 1 line to avoid 3 * pitch movq mm4, [esi+2*edx] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers. paddw mm3, mm4 ; mm3 += mm4 paddw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and saturate movd [edi],mm3 ; store the results in the destination sub esi, edx ; subtract edx to get back to -1 column movq mm3, [esi+4] ; mm3 = p4..p12 punpcklbw mm3, mm0 ; mm3 = p4..p7 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. movq mm4, [esi +edx +4] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers. 
paddw mm3, mm4 ; mm3 += mm4 movq mm4, [esi +2*edx+4] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers. paddw mm3, mm4 ; mm3 += mm4 add esi, edx ; move source forward 1 line to avoid 3 * pitch movq mm4, [esi+2*edx+4] ; mm4 = p0..p8 punpcklbw mm4, mm0 ; mm4 = p0..p3 pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers. paddw mm3, mm4 ; mm3 += mm4 paddw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and saturate movd [edi+4],mm3 ; store the results in the destination // the subsequent iterations repeat 3 out of 4 of these reads. Since the // recon block should be in cache this shouldn't cost much. Its obviously // avoidable!!!. add edi,eax; dec ecx ; decrement count jnz nextrow ; next row } } void FilterBlock1d_hb8_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter ) { __asm { mov edi, Filter movq mm1, [edi] ; mm3 *= kernel 0 modifiers. movq mm2, [edi + 16] ; mm3 *= kernel 0 modifiers. mov edi,OutputPtr mov esi,SrcPtr mov ecx, DWORD PTR OutputHeight mov eax, OutputWidth ; destination pitch? pxor mm0, mm0 ; mm0 = 00000000 nextrow: movq mm3, [esi] ; mm3 = p-1..p14 movq mm4, mm3 ; mm4 = p-1..p14 punpcklbw mm3, mm0 ; mm3 = p-1..p6 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. psrlq mm4, 8 ; mm4 = p0..p13 movq mm5, mm4 ; mm5 = p0..p13 punpcklbw mm5, mm0 ; mm5 = p0..p7 pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers paddw mm3, mm5 ; mm3 += mm5 paddw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and unpack to saturate movd [edi],mm3 ; store the results in the destination movq mm3, [esi+4] ; mm3 = p-1..p14 movq mm4, mm3 ; mm4 = p-1..p14 punpcklbw mm3, mm0 ; mm3 = p-1..p6 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. psrlq mm4, 8 ; mm4 = p0..p13 movq mm5, mm4 ; mm5 = p0..p13 punpcklbw mm5, mm0 ; mm5 = p0..p7 pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers paddw mm3, mm5 ; mm3 += mm5 paddw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and unpack to saturate movd [edi+4],mm3 ; store the results in the destination add esi,SrcPixelsPerLine ; next line add edi,eax; dec ecx ; decrement count jnz nextrow ; next row } } void FilterBlock1d_vb8_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter ) { __asm { mov edi, Filter movq mm1, [edi] ; mm3 *= kernel 0 modifiers. movq mm2, [edi + 16] ; mm3 *= kernel 0 modifiers. mov edx, PixelsPerLine mov edi, OutputPtr mov esi, SrcPtr mov ecx, DWORD PTR OutputHeight mov eax, OutputWidth ; destination pitch? pxor mm0, mm0 ; mm0 = 00000000 nextrow: movq mm3, [esi] ; mm3 = p0..p16 punpcklbw mm3, mm0 ; mm3 = p0..p8 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. movq mm4, [esi +edx ] ; mm4 = p0..p16 punpcklbw mm4, mm0 ; mm4 = p0..p8 pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers. paddw mm3, mm4 ; mm3 += mm4 paddw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and unpack to saturate movd [edi],mm3 ; store the results in the destination movq mm3, [esi+4] ; mm3 = p0..p16 punpcklbw mm3, mm0 ; mm3 = p0..p8 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. movq mm4, [esi +edx +4] ; mm4 = p0..p16 punpcklbw mm4, mm0 ; mm4 = p0..p8 pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers. 
paddw mm3, mm4 ; mm3 += mm4 paddw mm3, rd ; mm3 += round value psraw mm3, FILTER_SHIFT ; mm3 /= 128 packuswb mm3, mm0 ; pack and unpack to saturate movd [edi+4],mm3 ; store the results in the destination // the subsequent iterations repeat 3 out of 4 of these reads. Since the // recon block should be in cache this shouldn't cost much. Its obviously // avoidable!!!. add esi,edx add edi,eax dec ecx ; decrement count jnz nextrow ; next row } } /**************************************************************************** * * ROUTINE : FilterBlock2dBil * * INPUTS : Pointer to source data * * OUTPUTS : Filtered data * * RETURNS : None. * * FUNCTION : Applies a bilinear filter on the intput data to produce * a predictor block (UINT16) * * SPECIAL NOTES : * * ERRORS : None. * ****************************************************************************/ _inline void FilterBlock2dBil_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter ) { __asm { mov eax, HFilter ; mov edi, OutputPtr ; mov esi, SrcPtr ; lea ecx, [edi+64] ; mov edx, SrcPixelsPerLine ; movq mm1, [eax] ; movq mm2, [eax+16] ; mov eax, VFilter ; pxor mm0, mm0 ; // get the first horizontal line done ; movq mm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movq mm4, mm3 ; make a copy of current line punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 punpckhbw mm4, mm0 ; pmullw mm3, mm1 ; pmullw mm4, mm1 ; movq mm5, [esi+1] ; movq mm6, mm5 ; punpcklbw mm5, mm0 ; punpckhbw mm6, mm0 ; pmullw mm5, mm2 ; pmullw mm6, mm2 ; paddw mm3, mm5 ; paddw mm4, mm6 ; paddw mm3, rd ; xmm3 += round value psraw mm3, FILTER_SHIFT ; xmm3 /= 128 paddw mm4, rd ; psraw mm4, FILTER_SHIFT ; movq mm7, mm3 ; packuswb mm7, mm4 ; add esi, edx ; next line NextRow: movq mm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movq mm4, mm3 ; make a copy of current line punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 punpckhbw mm4, mm0 ; pmullw mm3, mm1 ; pmullw mm4, mm1 ; movq mm5, [esi+1] ; movq mm6, mm5 ; punpcklbw mm5, mm0 ; punpckhbw mm6, mm0 ; pmullw mm5, mm2 ; pmullw mm6, mm2 ; paddw mm3, mm5 ; paddw mm4, mm6 ; movq mm5, mm7 ; movq mm6, mm7 ; punpcklbw mm5, mm0 ; punpckhbw mm6, mm0 pmullw mm5, [eax] ; pmullw mm6, [eax] ; paddw mm3, rd ; xmm3 += round value psraw mm3, FILTER_SHIFT ; xmm3 /= 128 paddw mm4, rd ; psraw mm4, FILTER_SHIFT ; movq mm7, mm3 ; packuswb mm7, mm4 ; pmullw mm3, [eax+16] ; pmullw mm4, [eax+16] ; paddw mm3, mm5 ; paddw mm4, mm6 ; paddw mm3, rd ; xmm3 += round value psraw mm3, FILTER_SHIFT ; xmm3 /= 128 paddw mm4, rd ; psraw mm4, FILTER_SHIFT ; packuswb mm3, mm4 movq [edi], mm3 ; store the results in the destination add esi, edx ; next line add edi, 8 ; cmp edi, ecx ; jne NextRow } // First filter 1d Horizontal //FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter ); // Now filter Verticaly //FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter); } /**************************************************************************** * * ROUTINE : FilterBlockBil_8 * * INPUTS : ReconPtr1, ReconPtr12 * Two pointers into the block of data to be filtered * These pointers bound the fractional pel position * PixelsPerLine * Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr12 * Modx, ModY * The fractional pel bits used to select a filter. * * * OUTPUTS : ReconRefPtr * A pointer to an 8x8 buffer into which UINT8 filtered data is written. * * RETURNS : None. 
 *
 * FUNCTION      : Produces a bilinear filtered fractional pel prediction block
 *                 with UINT8 output
 *
 * SPECIAL NOTES :
 *
 * ERRORS        : None.
 *
 ****************************************************************************/
void FilterBlockBil_8_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr,
                           UINT32 PixelsPerLine, INT32 ModX, INT32 ModY )
{
    int diff;

    // Swap pointers so that ReconPtr1 is the smaller (above, left, above-right or above-left).
    diff = ReconPtr2 - ReconPtr1;

    // The ModX and ModY arguments are the bottom three bits of the signed motion vector
    // components (at 1/8th pel precision). This works out to be what we want despite the
    // pointer swapping that goes on below.
    // For example, if the X component of the vector is positive, ModX = X%8;
    // if the X component of the vector is negative, ModX = 8+(X%8) where X%8 is in the range -7 to -1.
    if ( diff < 0 )
    {
        // Swap pointers so ReconPtr1 is the smaller.
        UINT8 *temp = ReconPtr1;
        ReconPtr1 = ReconPtr2;
        ReconPtr2 = temp;
        diff = (int)(ReconPtr2 - ReconPtr1);
    }

    if ( diff == 1 )
    {
        FilterBlock1d_hb8_mmx( ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 8,
                               BilinearFilters_mmx[ModX] );
    }
    else if ( diff == (int)(PixelsPerLine) )        // Fractional pixel in vertical only
    {
        FilterBlock1d_vb8_mmx( ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 8,
                               BilinearFilters_mmx[ModY] );
    }
    else if ( diff == (int)(PixelsPerLine - 1) )    // ReconPtr1 is Top right
    {
        FilterBlock2dBil_mmx( ReconPtr1 - 1, ReconRefPtr, PixelsPerLine,
                              BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
    }
    else if ( diff == (int)(PixelsPerLine + 1) )    // ReconPtr1 is Top left
    {
        FilterBlock2dBil_mmx( ReconPtr1, ReconRefPtr, PixelsPerLine,
                              BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
    }
}

/****************************************************************************
 *
 * ROUTINE       : FilterBlock2d
 *
 * INPUTS        : Pointer to source data
 *
 * OUTPUTS       : Filtered data
 *
 * RETURNS       : None.
 *
 * FUNCTION      : Applies a 2d 4 tap filter on the input data to produce
 *                 a predictor block (UINT8)
 *
 * SPECIAL NOTES :
 *
 * ERRORS        : None.
 *
 ****************************************************************************/
void FilterBlock2d_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine,
                        INT16 *HFilter, INT16 *VFilter )
{
    UINT8 Intermediate[256];

    // First filter 1d horizontally.
    FilterBlock1d_h_mmx( SrcPtr - SrcPixelsPerLine, Intermediate, SrcPixelsPerLine, 1, 11, 8, HFilter );

    // Now filter vertically.
    FilterBlock1d_v_mmx( Intermediate + BLOCK_HEIGHT_WIDTH, OutputPtr, BLOCK_HEIGHT_WIDTH,
                         BLOCK_HEIGHT_WIDTH, 8, 8, VFilter );
}

/****************************************************************************
 *
 * ROUTINE       : FilterBlock
 *
 * INPUTS        : ReconPtr1, ReconPtr2
 *                 Two pointers into the block of data to be filtered.
 *                 These pointers bound the fractional pel position.
 *                 PixelsPerLine
 *                 Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2.
 *                 ModX, ModY
 *                 The fractional pel bits used to select a filter.
 *                 UseBicubic
 *                 Whether to use the bicubic filter set or the bilinear set.
 *
 * OUTPUTS       : ReconRefPtr
 *                 A pointer to an 8x8 buffer into which the filtered data is written.
 *
 * RETURNS       : None.
 *
 * FUNCTION      : Produces a filtered fractional pel prediction block
 *                 using bilinear or bicubic filters
 *
 * SPECIAL NOTES :
 *
 * ERRORS        : None.
 *
 ****************************************************************************/
void FilterBlock_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr,
                      UINT32 PixelsPerLine, INT32 ModX, INT32 ModY,
                      BOOL UseBicubic, UINT8 BicubicAlpha )
{
    int   diff;
    UINT8 Intermediate[256];

    // Swap pointers so that ReconPtr1 is the smaller (above, left, above-right or above-left).
    diff = ReconPtr2 - ReconPtr1;

    // The ModX and ModY arguments are the bottom three bits of the signed motion vector
    // components (at 1/8th pel precision). This works out to be what we want despite the
    // pointer swapping that goes on below.
    // For example, if the X component of the vector is positive, ModX = X%8;
    // if the X component of the vector is negative, ModX = 8+(X%8) where X%8 is in the range -7 to -1.
    if ( diff < 0 )
    {
        // Swap pointers so ReconPtr1 is the smaller.
        UINT8 *temp = ReconPtr1;
        ReconPtr1 = ReconPtr2;
        ReconPtr2 = temp;
        diff = (int)(ReconPtr2 - ReconPtr1);
    }

    if ( !diff )
    {
        return;
    }

    if ( diff == 1 )                                // Fractional pixel in horizontal only
    {
        if ( UseBicubic )
            FilterBlock1d_h_mmx( ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8,
                                 BicubicFilters_mmx[BicubicAlpha][ModX] );
        else
            FilterBlock1d_hb8_mmx( ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8,
                                   BilinearFilters_mmx[ModX] );
    }
    else if ( diff == (int)(PixelsPerLine) )        // Fractional pixel in vertical only
    {
        if ( UseBicubic )
            FilterBlock1d_v_mmx( ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8,
                                 BicubicFilters_mmx[BicubicAlpha][ModY] );
        else
            FilterBlock1d_vb8_mmx( ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8,
                                   BilinearFilters_mmx[ModY] );
    }
    else if ( diff == (int)(PixelsPerLine - 1) )    // ReconPtr1 is Top right
    {
        if ( UseBicubic )
            FilterBlock2d_mmx( ReconPtr1 - 1, Intermediate, PixelsPerLine,
                               BicubicFilters_mmx[BicubicAlpha][ModX],
                               BicubicFilters_mmx[BicubicAlpha][ModY] );
        else
            FilterBlock2dBil_mmx( ReconPtr1 - 1, Intermediate, PixelsPerLine,
                                  BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
    }
    else if ( diff == (int)(PixelsPerLine + 1) )    // ReconPtr1 is Top left
    {
        if ( UseBicubic )
            FilterBlock2d_mmx( ReconPtr1, Intermediate, PixelsPerLine,
                               BicubicFilters_mmx[BicubicAlpha][ModX],
                               BicubicFilters_mmx[BicubicAlpha][ModY] );
        else
            FilterBlock2dBil_mmx( ReconPtr1, Intermediate, PixelsPerLine,
                                  BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
    }

    // Cast matches the INT16 * prototype of UnpackBlock_MMX declared at the top of this file.
    UnpackBlock_MMX( Intermediate, (INT16 *)ReconRefPtr, 8 );
}
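
/****************************************************************************
 *
 * ROUTINE       : FilterBlock1d_h_ref (illustrative sketch only)
 *
 * FUNCTION      : Plain-C sketch of the arithmetic the MMX horizontal
 *                 filters above are intended to perform. It is not part of
 *                 the codec build and is provided purely as documentation.
 *                 It assumes the tap layout used by the filter tables in
 *                 this file: each of the four taps is replicated eight
 *                 times, so tap k is read from Filter[k * 8]. Each output
 *                 pixel is a 4-tap sum over source pixels j-1..j+2, scaled
 *                 by FILTER_WEIGHT (128), rounded, shifted right by
 *                 FILTER_SHIFT (7) and clamped to 0..255, mirroring the
 *                 paddsw / psraw / packuswb sequence in the assembler
 *                 (intermediate saturation aside). The PixelStep argument
 *                 of the MMX versions is omitted because the horizontal
 *                 step here is always one byte.
 *
 ****************************************************************************/
static void FilterBlock1d_h_ref( UINT8 *SrcPtr, UINT8 *OutputPtr,
                                 UINT32 SrcPixelsPerLine,
                                 UINT32 OutputHeight, UINT32 OutputWidth,
                                 INT16 *Filter )
{
    UINT32 i, j;
    INT32  k, Temp;

    for ( i = 0; i < OutputHeight; i++ )
    {
        for ( j = 0; j < OutputWidth; j++ )
        {
            Temp = 0;
            for ( k = 0; k < 4; k++ )
            {
                INT32 x = (INT32)j + k - 1;     // taps cover pixels j-1 .. j+2
                Temp += (INT32)Filter[k * 8] * (INT32)SrcPtr[x];
            }

            // Round, scale down by FILTER_WEIGHT and saturate to a byte.
            Temp = ( Temp + (FILTER_WEIGHT >> 1) ) >> FILTER_SHIFT;
            if ( Temp < 0 )
                Temp = 0;
            else if ( Temp > 255 )
                Temp = 255;

            OutputPtr[j] = (UINT8)Temp;
        }

        // Like the MMX routines, advance the destination by OutputWidth.
        SrcPtr    += SrcPixelsPerLine;
        OutputPtr += OutputWidth;
    }
}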