- /****************************************************************************
- *
- * Module Title : newLoopTest_asm.c
- *
- * Description : Codec specific functions
- *
- * AUTHOR : Yaowu Xu
- *
- *****************************************************************************
- * Revision History
- *
- * 1.02 YWX 03-Nov-00 Changed confusing variable name
- * 1.01 YWX 02-Nov-00 Added the set of functions
- * 1.00 YWX 19-Oct-00 configuration baseline
- *****************************************************************************
- */
- /****************************************************************************
- * Header Files
- *****************************************************************************
- */
- #define STRICT /* Strict type checking. */
- #include "codec_common.h"
- #include <math.h>
- /****************************************************************************
- * Module constants.
- *****************************************************************************
- */
- #define MIN(a, b) (((a) < (b)) ? (a) : (b))
- #define FILTER_WEIGHT 128
- #define FILTER_SHIFT 7
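- // FILTER_WEIGHT is the sum of the taps in every kernel below, and
- // FILTER_SHIFT normalises the filtered sum, since 128 == 1 << 7.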
- extern void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
- static __declspec(align(16)) short rd[]={64,64,64,64,64,64,64,64};
- __declspec(align(16)) INT16 BilinearFilters_mmx[8][16] =
- {
- { 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0 },
- { 112,112,112,112,112,112,112,112, 16, 16, 16, 16, 16, 16, 16, 16 },
- { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
- { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
- { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
- { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
- { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
- { 16, 16, 16, 16, 16, 16, 16, 16, 112,112,112,112,112,112,112,112 }
- };
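- // Row i of BilinearFilters_mmx encodes the two-tap pair
- // ( FILTER_WEIGHT - 16*i, 16*i ) for fractional position i/8, each tap
- // replicated eight times so one pmullw applies it to eight pixels at once.
- // Scalar sketch of a single output pixel (illustrative; not part of the
- // original file):
- //
- // INT32 Temp = a * BilinearFilters_mmx[ModX][0]
- // + b * BilinearFilters_mmx[ModX][8]
- // + (FILTER_WEIGHT >> 1);
- // Pel = (UINT8)( Temp >> FILTER_SHIFT );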
- __declspec(align(16)) INT16 BicubicFilters_mmx[17][8][32] =
- {
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -3, -3, -3, -3, -3, -3, -3, -3, 122,122,122,122,122,122,122,122, 9, 9, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 109,109,109,109,109,109,109,109, 24, 24, 24, 24, 24, 24, 24, 24, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 91, 91, 91, 91, 91, 91, 91, 91, 45, 45, 45, 45, 45, 45, 45, 45, -3, -3, -3, -3, -3, -3, -3, -3, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, -4, -4, -4, -4, -4, -4, -4, -4, },
- { -3, -3, -3, -3, -3, -3, -3, -3, 45, 45, 45, 45, 45, 45, 45, 45, 91, 91, 91, 91, 91, 91, 91, 91, -5, -5, -5, -5, -5, -5, -5, -5, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 24, 24, 24, 24, 24, 24, 24, 24, 109,109,109,109,109,109,109,109, -4, -4, -4, -4, -4, -4, -4, -4, },
- { 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 122,122,122,122,122,122,122,122, -3, -3, -3, -3, -3, -3, -3, -3, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 124,124,124,124,124,124,124,124, 9, 9, 9, 9, 9, 9, 9, 9, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 110,110,110,110,110,110,110,110, 25, 25, 25, 25, 25, 25, 25, 25, -2, -2, -2, -2, -2, -2, -2, -2, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 91, 91, 91, 91, 91, 91, 91, 91, 46, 46, 46, 46, 46, 46, 46, 46, -3, -3, -3, -3, -3, -3, -3, -3, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, -5, -5, -5, -5, -5, -5, -5, -5, },
- { -3, -3, -3, -3, -3, -3, -3, -3, 46, 46, 46, 46, 46, 46, 46, 46, 91, 91, 91, 91, 91, 91, 91, 91, -6, -6, -6, -6, -6, -6, -6, -6, },
- { -2, -2, -2, -2, -2, -2, -2, -2, 25, 25, 25, 25, 25, 25, 25, 25, 110,110,110,110,110,110,110,110, -5, -5, -5, -5, -5, -5, -5, -5, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 9, 9, 9, 9, 9, 9, 9, 9, 124,124,124,124,124,124,124,124, -4, -4, -4, -4, -4, -4, -4, -4, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 123,123,123,123,123,123,123,123, 10, 10, 10, 10, 10, 10, 10, 10, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 110,110,110,110,110,110,110,110, 26, 26, 26, 26, 26, 26, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, },
- { -7, -7, -7, -7, -7, -7, -7, -7, 92, 92, 92, 92, 92, 92, 92, 92, 47, 47, 47, 47, 47, 47, 47, 47, -4, -4, -4, -4, -4, -4, -4, -4, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -6, -6, -6, -6, -6, -6, -6, -6, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 47, 47, 47, 47, 47, 47, 47, 47, 92, 92, 92, 92, 92, 92, 92, 92, -7, -7, -7, -7, -7, -7, -7, -7, },
- { -2, -2, -2, -2, -2, -2, -2, -2, 26, 26, 26, 26, 26, 26, 26, 26, 110,110,110,110,110,110,110,110, -6, -6, -6, -6, -6, -6, -6, -6, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 123,123,123,123,123,123,123,123, -4, -4, -4, -4, -4, -4, -4, -4, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 124,124,124,124,124,124,124,124, 10, 10, 10, 10, 10, 10, 10, 10, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -7, -7, -7, -7, -7, -7, -7, -7, 110,110,110,110,110,110,110,110, 27, 27, 27, 27, 27, 27, 27, 27, -2, -2, -2, -2, -2, -2, -2, -2, },
- { -7, -7, -7, -7, -7, -7, -7, -7, 91, 91, 91, 91, 91, 91, 91, 91, 48, 48, 48, 48, 48, 48, 48, 48, -4, -4, -4, -4, -4, -4, -4, -4, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -6, -6, -6, -6, -6, -6, -6, -6, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 48, 48, 48, 48, 48, 48, 48, 48, 92, 92, 92, 92, 92, 92, 92, 92, -8, -8, -8, -8, -8, -8, -8, -8, },
- { -2, -2, -2, -2, -2, -2, -2, -2, 27, 27, 27, 27, 27, 27, 27, 27, 110,110,110,110,110,110,110,110, -7, -7, -7, -7, -7, -7, -7, -7, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 124,124,124,124,124,124,124,124, -5, -5, -5, -5, -5, -5, -5, -5, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 124,124,124,124,124,124,124,124, 11, 11, 11, 11, 11, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -8, -8, -8, -8, -8, -8, -8, -8, 111,111,111,111,111,111,111,111, 28, 28, 28, 28, 28, 28, 28, 28, -3, -3, -3, -3, -3, -3, -3, -3, },
- { -8, -8, -8, -8, -8, -8, -8, -8, 92, 92, 92, 92, 92, 92, 92, 92, 49, 49, 49, 49, 49, 49, 49, 49, -5, -5, -5, -5, -5, -5, -5, -5, },
- { -7, -7, -7, -7, -7, -7, -7, -7, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, -7, -7, -7, -7, -7, -7, -7, -7, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 49, 49, 49, 49, 49, 49, 49, 49, 92, 92, 92, 92, 92, 92, 92, 92, -8, -8, -8, -8, -8, -8, -8, -8, },
- { -3, -3, -3, -3, -3, -3, -3, -3, 28, 28, 28, 28, 28, 28, 28, 28, 111,111,111,111,111,111,111,111, -8, -8, -8, -8, -8, -8, -8, -8, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 11, 11, 11, 11, 11, 11, 11, 11, 124,124,124,124,124,124,124,124, -6, -6, -6, -6, -6, -6, -6, -6, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 123,123,123,123,123,123,123,123, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -9, -9, -9, -9, -9, -9, -9, -9, 111,111,111,111,111,111,111,111, 29, 29, 29, 29, 29, 29, 29, 29, -3, -3, -3, -3, -3, -3, -3, -3, },
- { -9, -9, -9, -9, -9, -9, -9, -9, 93, 93, 93, 93, 93, 93, 93, 93, 50, 50, 50, 50, 50, 50, 50, 50, -6, -6, -6, -6, -6, -6, -6, -6, },
- { -8, -8, -8, -8, -8, -8, -8, -8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, -8, -8, -8, -8, -8, -8, -8, -8, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 50, 50, 50, 50, 50, 50, 50, 50, 93, 93, 93, 93, 93, 93, 93, 93, -9, -9, -9, -9, -9, -9, -9, -9, },
- { -3, -3, -3, -3, -3, -3, -3, -3, 29, 29, 29, 29, 29, 29, 29, 29, 111,111,111,111,111,111,111,111, -9, -9, -9, -9, -9, -9, -9, -9, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 123,123,123,123,123,123,123,123, -6, -6, -6, -6, -6, -6, -6, -6, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -7, -7, -7, -7, -7, -7, -7, -7, 124,124,124,124,124,124,124,124, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -10,-10,-10,-10,-10,-10,-10,-10, 111,111,111,111,111,111,111,111, 30, 30, 30, 30, 30, 30, 30, 30, -3, -3, -3, -3, -3, -3, -3, -3, },
- { -10,-10,-10,-10,-10,-10,-10,-10, 93, 93, 93, 93, 93, 93, 93, 93, 51, 51, 51, 51, 51, 51, 51, 51, -6, -6, -6, -6, -6, -6, -6, -6, },
- { -9, -9, -9, -9, -9, -9, -9, -9, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, -9, -9, -9, -9, -9, -9, -9, -9, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 51, 51, 51, 51, 51, 51, 51, 51, 93, 93, 93, 93, 93, 93, 93, 93, -10,-10,-10,-10,-10,-10,-10,-10, },
- { -3, -3, -3, -3, -3, -3, -3, -3, 30, 30, 30, 30, 30, 30, 30, 30, 111,111,111,111,111,111,111,111, -10,-10,-10,-10,-10,-10,-10,-10, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 124,124,124,124,124,124,124,124, -7, -7, -7, -7, -7, -7, -7, -7, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -7, -7, -7, -7, -7, -7, -7, -7, 123,123,123,123,123,123,123,123, 13, 13, 13, 13, 13, 13, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -11,-11,-11,-11,-11,-11,-11,-11, 112,112,112,112,112,112,112,112, 31, 31, 31, 31, 31, 31, 31, 31, -4, -4, -4, -4, -4, -4, -4, -4, },
- { -11,-11,-11,-11,-11,-11,-11,-11, 94, 94, 94, 94, 94, 94, 94, 94, 52, 52, 52, 52, 52, 52, 52, 52, -7, -7, -7, -7, -7, -7, -7, -7, },
- { -10,-10,-10,-10,-10,-10,-10,-10, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -10,-10,-10,-10,-10,-10,-10,-10, },
- { -7, -7, -7, -7, -7, -7, -7, -7, 52, 52, 52, 52, 52, 52, 52, 52, 94, 94, 94, 94, 94, 94, 94, 94, -11,-11,-11,-11,-11,-11,-11,-11, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 31, 31, 31, 31, 31, 31, 31, 31, 112,112,112,112,112,112,112,112, -11,-11,-11,-11,-11,-11,-11,-11, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 13, 13, 13, 13, 13, 13, 13, 13, 123,123,123,123,123,123,123,123, -7, -7, -7, -7, -7, -7, -7, -7, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -8, -8, -8, -8, -8, -8, -8, -8, 124,124,124,124,124,124,124,124, 13, 13, 13, 13, 13, 13, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -12,-12,-12,-12,-12,-12,-12,-12, 112,112,112,112,112,112,112,112, 32, 32, 32, 32, 32, 32, 32, 32, -4, -4, -4, -4, -4, -4, -4, -4, },
- { -12,-12,-12,-12,-12,-12,-12,-12, 94, 94, 94, 94, 94, 94, 94, 94, 53, 53, 53, 53, 53, 53, 53, 53, -7, -7, -7, -7, -7, -7, -7, -7, },
- { -10,-10,-10,-10,-10,-10,-10,-10, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -10,-10,-10,-10,-10,-10,-10,-10, },
- { -7, -7, -7, -7, -7, -7, -7, -7, 53, 53, 53, 53, 53, 53, 53, 53, 94, 94, 94, 94, 94, 94, 94, 94, -12,-12,-12,-12,-12,-12,-12,-12, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 32, 32, 32, 32, 32, 32, 32, 32, 112,112,112,112,112,112,112,112, -12,-12,-12,-12,-12,-12,-12,-12, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 13, 13, 13, 13, 13, 13, 13, 13, 124,124,124,124,124,124,124,124, -8, -8, -8, -8, -8, -8, -8, -8, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -9, -9, -9, -9, -9, -9, -9, -9, 124,124,124,124,124,124,124,124, 14, 14, 14, 14, 14, 14, 14, 14, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -13,-13,-13,-13,-13,-13,-13,-13, 112,112,112,112,112,112,112,112, 33, 33, 33, 33, 33, 33, 33, 33, -4, -4, -4, -4, -4, -4, -4, -4, },
- { -13,-13,-13,-13,-13,-13,-13,-13, 95, 95, 95, 95, 95, 95, 95, 95, 54, 54, 54, 54, 54, 54, 54, 54, -8, -8, -8, -8, -8, -8, -8, -8, },
- { -11,-11,-11,-11,-11,-11,-11,-11, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, -11,-11,-11,-11,-11,-11,-11,-11, },
- { -8, -8, -8, -8, -8, -8, -8, -8, 54, 54, 54, 54, 54, 54, 54, 54, 95, 95, 95, 95, 95, 95, 95, 95, -13,-13,-13,-13,-13,-13,-13,-13, },
- { -4, -4, -4, -4, -4, -4, -4, -4, 33, 33, 33, 33, 33, 33, 33, 33, 112,112,112,112,112,112,112,112, -13,-13,-13,-13,-13,-13,-13,-13, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 14, 14, 14, 14, 14, 14, 14, 14, 124,124,124,124,124,124,124,124, -9, -9, -9, -9, -9, -9, -9, -9, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -9, -9, -9, -9, -9, -9, -9, -9, 123,123,123,123,123,123,123,123, 15, 15, 15, 15, 15, 15, 15, 15, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -14,-14,-14,-14,-14,-14,-14,-14, 113,113,113,113,113,113,113,113, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5, },
- { -14,-14,-14,-14,-14,-14,-14,-14, 95, 95, 95, 95, 95, 95, 95, 95, 55, 55, 55, 55, 55, 55, 55, 55, -8, -8, -8, -8, -8, -8, -8, -8, },
- { -12,-12,-12,-12,-12,-12,-12,-12, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, -12,-12,-12,-12,-12,-12,-12,-12, },
- { -8, -8, -8, -8, -8, -8, -8, -8, 55, 55, 55, 55, 55, 55, 55, 55, 95, 95, 95, 95, 95, 95, 95, 95, -14,-14,-14,-14,-14,-14,-14,-14, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 112,112,112,112,112,112,112,112, -13,-13,-13,-13,-13,-13,-13,-13, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 15, 15, 15, 15, 15, 15, 15, 15, 123,123,123,123,123,123,123,123, -9, -9, -9, -9, -9, -9, -9, -9, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -10,-10,-10,-10,-10,-10,-10,-10, 124,124,124,124,124,124,124,124, 15, 15, 15, 15, 15, 15, 15, 15, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -14,-14,-14,-14,-14,-14,-14,-14, 113,113,113,113,113,113,113,113, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5, },
- { -15,-15,-15,-15,-15,-15,-15,-15, 96, 96, 96, 96, 96, 96, 96, 96, 56, 56, 56, 56, 56, 56, 56, 56, -9, -9, -9, -9, -9, -9, -9, -9, },
- { -13,-13,-13,-13,-13,-13,-13,-13, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, -13,-13,-13,-13,-13,-13,-13,-13, },
- { -9, -9, -9, -9, -9, -9, -9, -9, 56, 56, 56, 56, 56, 56, 56, 56, 96, 96, 96, 96, 96, 96, 96, 96, -15,-15,-15,-15,-15,-15,-15,-15, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 113,113,113,113,113,113,113,113, -14,-14,-14,-14,-14,-14,-14,-14, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 15, 15, 15, 15, 15, 15, 15, 15, 124,124,124,124,124,124,124,124, -10,-10,-10,-10,-10,-10,-10,-10, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -10,-10,-10,-10,-10,-10,-10,-10, 123,123,123,123,123,123,123,123, 16, 16, 16, 16, 16, 16, 16, 16, -1, -1, -1, -1, -1, -1, -1, -1, },
- { -15,-15,-15,-15,-15,-15,-15,-15, 113,113,113,113,113,113,113,113, 35, 35, 35, 35, 35, 35, 35, 35, -5, -5, -5, -5, -5, -5, -5, -5, },
- { -16,-16,-16,-16,-16,-16,-16,-16, 98, 98, 98, 98, 98, 98, 98, 98, 56, 56, 56, 56, 56, 56, 56, 56, -10,-10,-10,-10,-10,-10,-10,-10, },
- { -14,-14,-14,-14,-14,-14,-14,-14, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, -14,-14,-14,-14,-14,-14,-14,-14, },
- { -10,-10,-10,-10,-10,-10,-10,-10, 56, 56, 56, 56, 56, 56, 56, 56, 98, 98, 98, 98, 98, 98, 98, 98, -16,-16,-16,-16,-16,-16,-16,-16, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 35, 35, 35, 35, 35, 35, 35, 35, 113,113,113,113,113,113,113,113, -15,-15,-15,-15,-15,-15,-15,-15, },
- { -1, -1, -1, -1, -1, -1, -1, -1, 16, 16, 16, 16, 16, 16, 16, 16, 123,123,123,123,123,123,123,123, -10,-10,-10,-10,-10,-10,-10,-10, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -11,-11,-11,-11,-11,-11,-11,-11, 124,124,124,124,124,124,124,124, 17, 17, 17, 17, 17, 17, 17, 17, -2, -2, -2, -2, -2, -2, -2, -2, },
- { -16,-16,-16,-16,-16,-16,-16,-16, 113,113,113,113,113,113,113,113, 36, 36, 36, 36, 36, 36, 36, 36, -5, -5, -5, -5, -5, -5, -5, -5, },
- { -17,-17,-17,-17,-17,-17,-17,-17, 98, 98, 98, 98, 98, 98, 98, 98, 57, 57, 57, 57, 57, 57, 57, 57, -10,-10,-10,-10,-10,-10,-10,-10, },
- { -14,-14,-14,-14,-14,-14,-14,-14, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, -14,-14,-14,-14,-14,-14,-14,-14, },
- { -10,-10,-10,-10,-10,-10,-10,-10, 57, 57, 57, 57, 57, 57, 57, 57, 98, 98, 98, 98, 98, 98, 98, 98, -17,-17,-17,-17,-17,-17,-17,-17, },
- { -5, -5, -5, -5, -5, -5, -5, -5, 36, 36, 36, 36, 36, 36, 36, 36, 113,113,113,113,113,113,113,113, -16,-16,-16,-16,-16,-16,-16,-16, },
- { -2, -2, -2, -2, -2, -2, -2, -2, 17, 17, 17, 17, 17, 17, 17, 17, 124,124,124,124,124,124,124,124, -11,-11,-11,-11,-11,-11,-11,-11, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -12,-12,-12,-12,-12,-12,-12,-12, 125,125,125,125,125,125,125,125, 17, 17, 17, 17, 17, 17, 17, 17, -2, -2, -2, -2, -2, -2, -2, -2, },
- { -17,-17,-17,-17,-17,-17,-17,-17, 114,114,114,114,114,114,114,114, 37, 37, 37, 37, 37, 37, 37, 37, -6, -6, -6, -6, -6, -6, -6, -6, },
- { -18,-18,-18,-18,-18,-18,-18,-18, 99, 99, 99, 99, 99, 99, 99, 99, 58, 58, 58, 58, 58, 58, 58, 58, -11,-11,-11,-11,-11,-11,-11,-11, },
- { -15,-15,-15,-15,-15,-15,-15,-15, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, -15,-15,-15,-15,-15,-15,-15,-15, },
- { -11,-11,-11,-11,-11,-11,-11,-11, 58, 58, 58, 58, 58, 58, 58, 58, 99, 99, 99, 99, 99, 99, 99, 99, -18,-18,-18,-18,-18,-18,-18,-18, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 37, 37, 37, 37, 37, 37, 37, 37, 114,114,114,114,114,114,114,114, -17,-17,-17,-17,-17,-17,-17,-17, },
- { -2, -2, -2, -2, -2, -2, -2, -2, 17, 17, 17, 17, 17, 17, 17, 17, 125,125,125,125,125,125,125,125, -12,-12,-12,-12,-12,-12,-12,-12, },
- },
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
- { -12,-12,-12,-12,-12,-12,-12,-12, 124,124,124,124,124,124,124,124, 18, 18, 18, 18, 18, 18, 18, 18, -2, -2, -2, -2, -2, -2, -2, -2, },
- { -18,-18,-18,-18,-18,-18,-18,-18, 114,114,114,114,114,114,114,114, 38, 38, 38, 38, 38, 38, 38, 38, -6, -6, -6, -6, -6, -6, -6, -6, },
- { -19,-19,-19,-19,-19,-19,-19,-19, 99, 99, 99, 99, 99, 99, 99, 99, 59, 59, 59, 59, 59, 59, 59, 59, -11,-11,-11,-11,-11,-11,-11,-11, },
- { -16,-16,-16,-16,-16,-16,-16,-16, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, -16,-16,-16,-16,-16,-16,-16,-16, },
- { -11,-11,-11,-11,-11,-11,-11,-11, 59, 59, 59, 59, 59, 59, 59, 59, 99, 99, 99, 99, 99, 99, 99, 99, -19,-19,-19,-19,-19,-19,-19,-19, },
- { -6, -6, -6, -6, -6, -6, -6, -6, 38, 38, 38, 38, 38, 38, 38, 38, 114,114,114,114,114,114,114,114, -18,-18,-18,-18,-18,-18,-18,-18, },
- { -2, -2, -2, -2, -2, -2, -2, -2, 18, 18, 18, 18, 18, 18, 18, 18, 124,124,124,124,124,124,124,124, -12,-12,-12,-12,-12,-12,-12,-12, },
- },
- // Dummy entry for VP61 support
- {
- { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
- { -4, -4, -4, -4, -4, -4, -4, -4, 118,118,118,118,118,118,118,118, 16, 16, 16, 16, 16, 16, 16, 16, -2, -2, -2, -2, -2, -2, -2, -2 },
- { -7, -7, -7, -7, -7, -7, -7, -7, 106,106,106,106,106,106,106,106, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5 },
- { -8, -8, -8, -8, -8, -8, -8, -8, 90, 90, 90, 90, 90, 90, 90, 90, 53, 53, 53, 53, 53, 53, 53, 53, -7, -7, -7, -7, -7, -7, -7, -7 },
- { -8, -8, -8, -8, -8, -8, -8, -8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, -8, -8, -8, -8, -8, -8, -8, -8 },
- { -7, -7, -7, -7, -7, -7, -7, -7, 53, 53, 53, 53, 53, 53, 53, 53, 90, 90, 90, 90, 90, 90, 90, 90, -8, -8, -8, -8, -8, -8, -8, -8 },
- { -5, -5, -5, -5, -5, -5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 106,106,106,106,106,106,106,106, -7, -7, -7, -7, -7, -7, -7, -7 },
- { -2, -2, -2, -2, -2, -2, -2, -2, 16, 16, 16, 16, 16, 16, 16, 16, 118,118,118,118,118,118,118,118, -4, -4, -4, -4, -4, -4, -4, -4 }
- }
- };
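- // Each 8-row group above holds the four bicubic kernels for the eight 1/8th-pel
- // positions at one alpha setting; every tap is replicated eight times so a single
- // pmullw applies it to eight pixels, and the taps in each row sum to FILTER_WEIGHT.
- // Scalar reference for the 1-D filtering the MMX routines below implement (an
- // illustrative sketch, not part of the original module and never called by it;
- // PixelStep is 1 for a horizontal pass or the buffer pitch for a vertical pass).
- static void FilterBlock1d_Ref( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 *Filter )
- {
- UINT32 i, j;
- INT32 Step = (INT32)PixelStep;
- for ( i = 0; i < OutputHeight; i++ )
- {
- for ( j = 0; j < OutputWidth; j++ )
- {
- UINT8 *p = SrcPtr + j;
- // Kernel taps sit at word offsets 0, 8, 16 and 24 within the filter row.
- INT32 Temp = p[-Step] * Filter[0]
- + p[0] * Filter[8]
- + p[Step] * Filter[16]
- + p[2*Step] * Filter[24]
- + (FILTER_WEIGHT >> 1); // rounding, as rd does
- Temp >>= FILTER_SHIFT; // divide by 128
- if ( Temp < 0 ) Temp = 0; // saturate to 0..255,
- else if ( Temp > 255 ) Temp = 255; // as packuswb does
- OutputPtr[j] = (UINT8)Temp;
- }
- SrcPtr += SrcPixelsPerLine;
- OutputPtr += OutputWidth; // destination pitch == output width
- }
- }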
- void FilterBlock1d_h_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
- {
- __asm
- {
- mov edi, Filter
- movq mm1, [edi] ; mm1 = kernel 0 taps
- movq mm2, [edi+ 16] ; mm2 = kernel 1 taps
- movq mm6, [edi + 32] ; mm6 = kernel 2 taps
- movq mm7, [edi + 48] ; mm7 = kernel 3 taps
- mov edi,OutputPtr
- mov esi,SrcPtr
- dec esi
- mov ecx, DWORD PTR OutputHeight
- mov eax, OutputWidth ; destination pitch (== output width)
- pxor mm0, mm0 ; mm0 = 00000000
- nextrow:
- movq mm3, [esi] ; mm3 = p-1..p6
- movq mm4, mm3 ; mm4 = p-1..p6
- punpcklbw mm3, mm0 ; mm3 = p-1..p2
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- psrlq mm4, 24 ; mm4 = p2..p6
- movq mm5, mm4 ; mm5 = p2..p6
- punpcklbw mm5, mm0 ; mm5 = p2..p5
- pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
- movq mm4, [esi+1] ; mm4 = p0..p6
- movq mm5, mm4 ; mm5 = p0..p6
- punpcklbw mm5, mm0 ; mm5 = p0..p3
- pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
- psrlq mm4, 8 ; mm4 = p1..p6
- movq mm5, mm4 ; mm5 = p1..p6
- punpcklbw mm5, mm0 ; mm5 = p1..p4
- pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
- paddsw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [edi],mm3 ; store the results in the destination
- movq mm3, [esi+4] ; mm3 = p3..p10
- movq mm4, mm3 ; mm4 = p3..p10
- punpcklbw mm3, mm0 ; mm3 = p3..p6
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- psrlq mm4, 24 ; mm4 = p6..p10
- movq mm5, mm4 ; mm5 = p6..p10
- punpcklbw mm5, mm0 ; mm5 = p6..p9
- pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
- movq mm4, [esi+5] ; mm4 = p4..p11
- movq mm5, mm4 ; mm5 = p4..p11
- punpcklbw mm5, mm0 ; mm5 = p4..p7
- pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
- psrlq mm4, 8 ; mm4 = p5..p11
- movq mm5, mm4 ; mm5 = p5..p11
- punpcklbw mm5, mm0 ; mm5 = p5..p8
- pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
- paddsw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [edi+4],mm3 ; store the results in the destination
- add esi,SrcPixelsPerLine ; next line
- add edi, eax ; step to next output row
- dec ecx ; decrement count
- jnz nextrow ; next row
- }
- }
- void FilterBlock1d_v_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
- {
- __asm
- {
- mov edi, Filter
- movq mm1, [edi] ; mm1 = kernel 0 taps
- movq mm2, [edi + 16] ; mm2 = kernel 1 taps
- movq mm6, [edi + 32] ; mm6 = kernel 2 taps
- movq mm7, [edi + 48] ; mm7 = kernel 3 taps
- mov edx, PixelsPerLine
- mov edi, OutputPtr
- mov esi, SrcPtr
- sub esi, PixelsPerLine
- mov ecx, DWORD PTR OutputHeight
- mov eax, OutputWidth ; destination pitch (== output width)
- pxor mm0, mm0 ; mm0 = 00000000
- nextrow:
- movq mm3, [esi] ; mm3 = p0..p8
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- add esi, edx ; move source forward 1 line to avoid 3 * pitch
- movq mm4, [esi+2*edx] ; mm4 = p0..p8
- punpcklbw mm4, mm0 ; mm4 = p0..p3
- pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
- movq mm4, [esi ] ; mm4 = p0..p8
- punpcklbw mm4, mm0 ; mm4 = p0..p3
- pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
- movq mm4, [esi +edx] ; mm4 = p0..p8
- punpcklbw mm4, mm0 ; mm4 = p0..p3
- pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
- paddsw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and saturate
- movd [edi],mm3 ; store the results in the destination
-
- sub esi, edx ; step back one line; the second four columns reuse the same source rows
- movq mm3, [esi+4] ; mm3 = p4..p11
- punpcklbw mm3, mm0 ; mm3 = p4..p7
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- add esi, edx ; move source forward 1 line to avoid 3 * pitch
- movq mm4, [esi+2*edx+4] ; mm4 = p4..p11
- punpcklbw mm4, mm0 ; mm4 = p4..p7
- pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
- movq mm4, [esi +4] ; mm4 = p4..p11
- punpcklbw mm4, mm0 ; mm4 = p4..p7
- pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
- movq mm4, [esi +edx+4] ; mm4 = p4..p11
- punpcklbw mm4, mm0 ; mm4 = p4..p7
- pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
- paddsw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and saturate
- movd [edi+4],mm3 ; store the results in the destination
- // Subsequent iterations re-read most of these source lines. Since the
- // recon block should be in cache this shouldn't cost much, and it is
- // obviously avoidable.
- add edi, eax ; step to next output row
- dec ecx ; decrement count
- jnz nextrow ; next row
- }
- }
- void FilterBlock1d_h_mmxa( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
- {
- __asm
- {
- mov edi, Filter
- movq mm1, [edi] ; mm1 = kernel 0 taps
- movq mm2, [edi+ 16] ; mm2 = kernel 1 taps
- movq mm6, [edi + 32] ; mm6 = kernel 2 taps
- movq mm7, [edi + 48] ; mm7 = kernel 3 taps
- mov edi,OutputPtr
- mov esi,SrcPtr
- dec esi
- mov ecx, DWORD PTR OutputHeight
- mov eax, OutputWidth ; destination pitch (== output width)
- pxor mm0, mm0 ; mm0 = 00000000
- nextrow:
- movq mm3, [esi] ; mm3 = p-1..p6
- movq mm4, mm3 ; mm4 = p-1..p6
- punpcklbw mm3, mm0 ; mm3 = p-1..p2
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- psrlq mm4, 8 ; mm4 = p0..p6
- movq mm5, mm4 ; mm5 = p0..p6
- punpcklbw mm5, mm0 ; mm5 = p0..p3
- pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers
- paddw mm3, mm5 ; mm3 += mm5
- psrlq mm4, 8 ; mm4 = p1..p6
- movq mm5, mm4 ; mm5 = p1..p6
- punpcklbw mm5, mm0 ; mm5 = p1..p4
- pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers
- paddw mm3, mm5 ; mm3 += mm5
- psrlq mm4, 8 ; mm4 = p2..p6
- movq mm5, mm4 ; mm5 = p2..p6
- punpcklbw mm5, mm0 ; mm5 = p2..p5
- pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers
- paddw mm3, mm5 ; mm3 += mm5
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [edi],mm3 ; store the results in the destination
- movq mm3, [esi+4] ; mm3 = p3..p10
- movq mm4, mm3 ; mm4 = p3..p10
- punpcklbw mm3, mm0 ; mm3 = p3..p6
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- psrlq mm4, 8 ; mm4 = p4..p10
- movq mm5, mm4 ; mm5 = p4..p10
- punpcklbw mm5, mm0 ; mm5 = p4..p7
- pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers
- paddw mm3, mm5 ; mm3 += mm5
- psrlq mm4, 8 ; mm4 = p5..p10
- movq mm5, mm4 ; mm5 = p5..p10
- punpcklbw mm5, mm0 ; mm5 = p5..p8
- pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers
- paddw mm3, mm5 ; mm3 += mm5
- psrlq mm4, 8 ; mm4 = p6..p10
- movq mm5, mm4 ; mm5 = p6..p10
- punpcklbw mm5, mm0 ; mm5 = p6..p9
- pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers
- paddw mm3, mm5 ; mm3 += mm5
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [edi+4],mm3 ; store the results in the destination
- add esi,SrcPixelsPerLine ; next line
- add edi, eax ; step to next output row
- dec ecx ; decrement count
- jnz nextrow ; next row
- }
- }
- void FilterBlock1d_v_mmxa( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
- {
- __asm
- {
- mov edi, Filter
- movq mm1, [edi] ; mm1 = kernel 0 taps
- movq mm2, [edi + 16] ; mm2 = kernel 1 taps
- movq mm6, [edi + 32] ; mm6 = kernel 2 taps
- movq mm7, [edi + 48] ; mm7 = kernel 3 taps
- mov edx, PixelsPerLine
- mov edi, OutputPtr
- mov esi, SrcPtr
- sub esi, PixelsPerLine
- mov ecx, DWORD PTR OutputHeight
- mov eax, OutputWidth ; destination pitch (== output width)
- pxor mm0, mm0 ; mm0 = 00000000
- nextrow:
- movq mm3, [esi] ; mm3 = p0..p8
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- movq mm4, [esi +edx ] ; mm4 = p0..p8
- punpcklbw mm4, mm0 ; mm4 = p0..p3
- pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers.
- paddw mm3, mm4 ; mm3 += mm4
- movq mm4, [esi +2*edx] ; mm4 = p0..p8
- punpcklbw mm4, mm0 ; mm4 = p0..p3
- pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers.
- paddw mm3, mm4 ; mm3 += mm4
- add esi, edx ; move source forward 1 line to avoid 3 * pitch
- movq mm4, [esi+2*edx] ; mm4 = p0..p8
- punpcklbw mm4, mm0 ; mm4 = p0..p3
- pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers.
- paddw mm3, mm4 ; mm3 += mm4
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and saturate
- movd [edi],mm3 ; store the results in the destination
-
- sub esi, edx ; step back one line; the second four columns reuse the same source rows
- movq mm3, [esi+4] ; mm3 = p4..p11
- punpcklbw mm3, mm0 ; mm3 = p4..p7
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- movq mm4, [esi +edx +4] ; mm4 = p4..p11
- punpcklbw mm4, mm0 ; mm4 = p4..p7
- pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers.
- paddw mm3, mm4 ; mm3 += mm4
- movq mm4, [esi +2*edx+4] ; mm4 = p4..p11
- punpcklbw mm4, mm0 ; mm4 = p4..p7
- pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers.
- paddw mm3, mm4 ; mm3 += mm4
- add esi, edx ; move source forward 1 line to avoid 3 * pitch
- movq mm4, [esi+2*edx+4] ; mm4 = p4..p11
- punpcklbw mm4, mm0 ; mm4 = p4..p7
- pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers.
- paddw mm3, mm4 ; mm3 += mm4
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and saturate
- movd [edi+4],mm3 ; store the results in the destination
- // Subsequent iterations re-read most of these source lines. Since the
- // recon block should be in cache this shouldn't cost much, and it is
- // obviously avoidable.
- add edi, eax ; step to next output row
- dec ecx ; decrement count
- jnz nextrow ; next row
- }
- }
- void FilterBlock1d_hb8_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
- {
- __asm
- {
- mov edi, Filter
- movq mm1, [edi] ; mm1 = kernel 0 taps
- movq mm2, [edi + 16] ; mm2 = kernel 1 taps
- mov edi,OutputPtr
- mov esi,SrcPtr
- mov ecx, DWORD PTR OutputHeight
- mov eax, OutputWidth ; destination pitch (== output width)
- pxor mm0, mm0 ; mm0 = 00000000
- nextrow:
- movq mm3, [esi] ; mm3 = p0..p7
- movq mm4, mm3 ; mm4 = p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- psrlq mm4, 8 ; mm4 = p1..p7
- movq mm5, mm4 ; mm5 = p1..p7
- punpcklbw mm5, mm0 ; mm5 = p1..p4
- pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers
- paddw mm3, mm5 ; mm3 += mm5
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [edi],mm3 ; store the results in the destination
- movq mm3, [esi+4] ; mm3 = p4..p11
- movq mm4, mm3 ; mm4 = p4..p11
- punpcklbw mm3, mm0 ; mm3 = p4..p7
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- psrlq mm4, 8 ; mm4 = p5..p11
- movq mm5, mm4 ; mm5 = p5..p11
- punpcklbw mm5, mm0 ; mm5 = p5..p8
- pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers
- paddw mm3, mm5 ; mm3 += mm5
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [edi+4],mm3 ; store the results in the destination
- add esi,SrcPixelsPerLine ; next line
- add edi, eax ; step to next output row
- dec ecx ; decrement count
- jnz nextrow ; next row
- }
- }
- void FilterBlock1d_vb8_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
- {
- __asm
- {
- mov edi, Filter
- movq mm1, [edi] ; mm1 = kernel 0 taps
- movq mm2, [edi + 16] ; mm2 = kernel 1 taps
- mov edx, PixelsPerLine
- mov edi, OutputPtr
- mov esi, SrcPtr
- mov ecx, DWORD PTR OutputHeight
- mov eax, OutputWidth ; destination pitch (== output width)
- pxor mm0, mm0 ; mm0 = 00000000
- nextrow:
- movq mm3, [esi] ; mm3 = p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- movq mm4, [esi +edx ] ; mm4 = p0..p7 of the line below
- punpcklbw mm4, mm0 ; mm4 = p0..p3
- pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers.
- paddw mm3, mm4 ; mm3 += mm4
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [edi],mm3 ; store the results in the destination
- movq mm3, [esi+4] ; mm3 = p4..p11
- punpcklbw mm3, mm0 ; mm3 = p4..p7
- pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
- movq mm4, [esi +edx +4] ; mm4 = p4..p11 of the line below
- punpcklbw mm4, mm0 ; mm4 = p4..p7
- pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers.
- paddw mm3, mm4 ; mm3 += mm4
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [edi+4],mm3 ; store the results in the destination
- // Subsequent iterations re-read most of these source lines. Since the
- // recon block should be in cache this shouldn't cost much, and it is
- // obviously avoidable.
- add esi,edx
- add edi,eax
- dec ecx ; decrement count
- jnz nextrow ; next row
- }
- }
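- // Scalar view of the two-tap variants above (an illustrative sketch, not part of
- // the original module and never called by it): only kernels 0 and 1 apply, so no
- // pixel before the block start is needed. PixelStep is 1 for the horizontal (hb8)
- // case or the buffer pitch for the vertical (vb8) case.
- static void FilterBlock1d_b8_Ref( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, INT16 *Filter )
- {
- UINT32 i, j;
- for ( i = 0; i < 8; i++ )
- {
- for ( j = 0; j < 8; j++ )
- {
- INT32 Temp = SrcPtr[j] * Filter[0]
- + SrcPtr[j + PixelStep] * Filter[8]
- + (FILTER_WEIGHT >> 1);
- // Bilinear taps are non-negative and sum to 128, so no clamp is needed.
- OutputPtr[j] = (UINT8)( Temp >> FILTER_SHIFT );
- }
- SrcPtr += PixelsPerLine;
- OutputPtr += 8; // destination pitch == block width
- }
- }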
-
- /****************************************************************************
- *
- * ROUTINE : FilterBlock2dBil
- *
- * INPUTS : Pointer to source data
- *
- * OUTPUTS : Filtered data
- *
- * RETURNS : None.
- *
- * FUNCTION : Applies a bilinear filter on the input data to produce
- * a predictor block (UINT8)
- *
- * SPECIAL NOTES :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- _inline
- void FilterBlock2dBil_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
- {
- __asm
- {
- mov eax, HFilter ;
- mov edi, OutputPtr ;
- mov esi, SrcPtr ;
- lea ecx, [edi+64] ; ecx = end of the 8x8 output block (8 rows of 8 bytes)
- mov edx, SrcPixelsPerLine ;
-
- movq mm1, [eax] ;
- movq mm2, [eax+16] ;
-
- mov eax, VFilter ;
- pxor mm0, mm0 ;
- // get the first horizontal line done ;
- movq mm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
- movq mm5, [esi+1] ;
- movq mm6, mm5 ;
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- paddw mm4, rd ; mm4 += round value
- psraw mm4, FILTER_SHIFT ; mm4 /= 128
-
- movq mm7, mm3 ; save the current filtered line,
- packuswb mm7, mm4 ; packed, for the vertical pass of the next row
- add esi, edx ; next line
- NextRow:
- movq mm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
- movq mm5, [esi+1] ;
- movq mm6, mm5 ;
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- movq mm5, mm7 ; mm5 = previous filtered line (packed)
- movq mm6, mm7 ;
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
- pmullw mm5, [eax] ; mm5 *= vertical kernel 0 taps
- pmullw mm6, [eax] ; mm6 *= vertical kernel 0 taps
-
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- paddw mm4, rd ; mm4 += round value
- psraw mm4, FILTER_SHIFT ; mm4 /= 128
-
- movq mm7, mm3 ; save the current filtered line,
- packuswb mm7, mm4 ; packed, for the next iteration
- 
- pmullw mm3, [eax+16] ; mm3 *= vertical kernel 1 taps
- pmullw mm4, [eax+16] ; mm4 *= vertical kernel 1 taps
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
-
- paddw mm3, rd ; mm3 += round value
- psraw mm3, FILTER_SHIFT ; mm3 /= 128
- paddw mm4, rd ; mm4 += round value
- psraw mm4, FILTER_SHIFT ; mm4 /= 128
-
- packuswb mm3, mm4
- movq [edi], mm3 ; store the results in the destination
- add esi, edx ; next line
- add edi, 8 ;
- cmp edi, ecx ;
- jne NextRow
- }
- // First filter 1d Horizontal
- //FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
- // Now filter vertically
- //FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
- }
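- // Scalar view of the fused loop above (an illustrative sketch, not part of the
- // original module and never called by it): each source line is filtered
- // horizontally, rounded and packed, then combined with the previously filtered
- // line using the vertical taps. mm7 plays the role of the "previous" line, so
- // each source line is read only once.
- static void FilterBlock2dBil_Ref( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 *HFilter, INT16 *VFilter )
- {
- UINT8 Row[2][8]; // current / previous horizontally filtered lines
- UINT32 i, j;
- for ( i = 0; i < 9; i++ ) // 8 output rows need 9 source rows
- {
- for ( j = 0; j < 8; j++ ) // horizontal pass
- {
- INT32 Temp = SrcPtr[j] * HFilter[0] + SrcPtr[j+1] * HFilter[8] + (FILTER_WEIGHT >> 1);
- Row[i & 1][j] = (UINT8)( Temp >> FILTER_SHIFT );
- }
- if ( i > 0 )
- {
- for ( j = 0; j < 8; j++ ) // vertical pass over two filtered lines
- {
- INT32 Temp = Row[(i - 1) & 1][j] * VFilter[0] + Row[i & 1][j] * VFilter[8] + (FILTER_WEIGHT >> 1);
- OutputPtr[j] = (UINT8)( Temp >> FILTER_SHIFT );
- }
- OutputPtr += 8;
- }
- SrcPtr += SrcPixelsPerLine;
- }
- }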
-
- /****************************************************************************
- *
- * ROUTINE : FilterBlockBil_8
- *
- * INPUTS : ReconPtr1, ReconPtr2
- * Two pointers into the block of data to be filtered
- * These pointers bound the fractional pel position
- * PixelsPerLine
- * Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
- * ModX, ModY
- * The fractional pel bits used to select a filter.
- *
- *
- * OUTPUTS : ReconRefPtr
- * A pointer to an 8x8 buffer into which UINT8 filtered data is written.
- *
- * RETURNS : None.
- *
- * FUNCTION : Produces a bilinear filtered fractional pel prediction block
- * with UINT8 output
- *
- * SPECIAL NOTES :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void FilterBlockBil_8_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY )
- {
- int diff;
- // Swap pointers if necessary so that ReconPtr1 is the smaller (above, left, above-right or above-left).
- diff=ReconPtr2-ReconPtr1;
- // The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
- // This works out to be what we want, despite the pointer swapping that goes on below.
- // For example, if the X component of the vector is positive, ModX = X%8;
- // if the X component of the vector is negative, ModX = 8+(X%8), where X%8 is in the range -7 to -1.
- if(diff<0)
- { // swap pointers so ReconPtr1 smaller
- UINT8 *temp=ReconPtr1;
- ReconPtr1=ReconPtr2;
- ReconPtr2=temp;
- diff= (int)(ReconPtr2-ReconPtr1);
- }
- if( diff==1 )
- {
- FilterBlock1d_hb8_mmx(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 8, BilinearFilters_mmx[ModX] );
- }
- else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
- {
- FilterBlock1d_vb8_mmx(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_mmx[ModY]);
- }
- else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
- {
- FilterBlock2dBil_mmx( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
- }
- else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
- {
- FilterBlock2dBil_mmx( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
- }
- }
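- // Usage sketch (illustrative; RefPtr and PredBuf are hypothetical names): a
- // half-pel horizontal prediction passes two pointers one pixel apart with
- // ModX = 4 and ModY = 0:
- //
- // FilterBlockBil_8_mmx( RefPtr, RefPtr + 1, PredBuf, PixelsPerLine, 4, 0 );
- //
- // diff == 1 selects the horizontal two-tap path with BilinearFilters_mmx[4],
- // whose taps are { 64, 64 }: a rounded average of the two pixels.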
- /****************************************************************************
- *
- * ROUTINE : FilterBlock2d
- *
- * INPUTS : Pointer to source data
- *
- * OUTPUTS : Filtered data
- *
- * RETURNS : None.
- *
- * FUNCTION : Applies a 2-D 4-tap filter on the input data to produce
- * a predictor block (UINT8)
- *
- * SPECIAL NOTES :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void FilterBlock2d_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
- {
- UINT8 Intermediate[256];
- // First filter 1d Horizontal
- FilterBlock1d_h_mmx(SrcPtr-SrcPixelsPerLine, Intermediate, SrcPixelsPerLine, 1, 11, 8, HFilter );
- // Now filter vertically
- FilterBlock1d_v_mmx(Intermediate+BLOCK_HEIGHT_WIDTH, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
- }
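- // Note: the vertical 4-tap needs one filtered row above and two below the eight
- // output rows, hence 11 intermediate rows, which is why the horizontal pass
- // starts at SrcPtr - SrcPixelsPerLine with an OutputHeight of 11. Because
- // FilterBlock1d_v_mmx steps back one pitch internally, it is handed
- // Intermediate + BLOCK_HEIGHT_WIDTH (the second intermediate row).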
-
- /****************************************************************************
- *
- * ROUTINE : FilterBlock
- *
- * INPUTS : ReconPtr1, ReconPtr2
- * Two pointers into the block of data to be filtered
- * These pointers bound the fractional pel position
- * PixelsPerLine
- * Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
- * ModX, ModY
- * The fractional pel bits used to select a filter.
- * UseBicubic
- * Whether to use the bicubic filter set or the bilinear set
- *
- *
- * OUTPUTS : ReconRefPtr
- * A pointer to an 8x8 buffer into which the filtered data is written.
- *
- * RETURNS : None.
- *
- * FUNCTION : Produces a filtered fractional pel prediction block
- * using bilinear or bicubic filters
- *
- * SPECIAL NOTES :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void FilterBlock_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha )
- {
- int diff;
- UINT8 Intermediate[256];
- // Swap pointers if necessary so that ReconPtr1 is the smaller (above, left, above-right or above-left).
- diff=ReconPtr2-ReconPtr1;
- // The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
- // This works out to be what we want, despite the pointer swapping that goes on below.
- // For example, if the X component of the vector is positive, ModX = X%8;
- // if the X component of the vector is negative, ModX = 8+(X%8), where X%8 is in the range -7 to -1.
- if(diff<0)
- { // swap pointers so ReconPtr1 smaller
- UINT8 *temp=ReconPtr1;
- ReconPtr1=ReconPtr2;
- ReconPtr2=temp;
- diff= (int)(ReconPtr2-ReconPtr1);
- }
- if(!diff)
- {
- return;
- }
- if( diff==1 )
- { // Fractional pixel in horizontal only
- if ( UseBicubic )
- FilterBlock1d_h_mmx(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModX] );
- else
- FilterBlock1d_hb8_mmx(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BilinearFilters_mmx[ModX] );
- }
- else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
- {
- if ( UseBicubic )
- FilterBlock1d_v_mmx(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModY]);
- else
- FilterBlock1d_vb8_mmx(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_mmx[ModY]);
- }
- else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
- {
- if ( UseBicubic )
- FilterBlock2d_mmx( ReconPtr1-1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
- else
- FilterBlock2dBil_mmx( ReconPtr1-1, Intermediate, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
- }
- else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
- {
- if ( UseBicubic )
- FilterBlock2d_mmx( ReconPtr1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
- else
- FilterBlock2dBil_mmx( ReconPtr1, Intermediate, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
- }
- UnpackBlock_MMX( Intermediate, ReconRefPtr, 8 );
- }
|