filtmmx.c 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053
  1. /****************************************************************************
  2. *
  3. * Module Title : newLoopTest_asm.c
  4. *
  5. * Description : Codec specific functions
  6. *
  7. * AUTHOR : Yaowu Xu
  8. *
  9. *****************************************************************************
  10. * Revision History
  11. *
  12. * 1.02 YWX 03-Nov-00 Changed confusing variable name
  13. * 1.01 YWX 02-Nov-00 Added the set of functions
  14. * 1.00 YWX 19-Oct-00 configuration baseline
  15. *****************************************************************************
  16. */
/****************************************************************************
* Header Files
*****************************************************************************
*/
  21. #define STRICT /* Strict type checking. */
  22. #include "codec_common.h"
  23. #include <math.h>
  24. /****************************************************************************
  25. * Module constants.
  26. *****************************************************************************
  27. */
  28. #define MIN(a, b) (((a) < (b)) ? (a) : (b))
  29. #define FILTER_WEIGHT 128
  30. #define FILTER_SHIFT 7
  31. extern void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
  32. static __declspec(align(16)) short rd[]={64,64,64,64,64,64,64,64};
  33. __declspec(align(16)) INT16 BilinearFilters_mmx[8][16] =
  34. {
  35. { 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0 },
  36. { 112,112,112,112,112,112,112,112, 16, 16, 16, 16, 16, 16, 16, 16 },
  37. { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
  38. { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
  39. { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
  40. { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
  41. { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
  42. { 16, 16, 16, 16, 16, 16, 16, 16, 112,112,112,112,112,112,112,112 }
  43. };
  44. __declspec(align(16)) INT16 BicubicFilters_mmx[17][8][32] =
  45. {
  46. {
  47. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  48. { -3, -3, -3, -3, -3, -3, -3, -3, 122,122,122,122,122,122,122,122, 9, 9, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, },
  49. { -4, -4, -4, -4, -4, -4, -4, -4, 109,109,109,109,109,109,109,109, 24, 24, 24, 24, 24, 24, 24, 24, -1, -1, -1, -1, -1, -1, -1, -1, },
  50. { -5, -5, -5, -5, -5, -5, -5, -5, 91, 91, 91, 91, 91, 91, 91, 91, 45, 45, 45, 45, 45, 45, 45, 45, -3, -3, -3, -3, -3, -3, -3, -3, },
  51. { -4, -4, -4, -4, -4, -4, -4, -4, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, -4, -4, -4, -4, -4, -4, -4, -4, },
  52. { -3, -3, -3, -3, -3, -3, -3, -3, 45, 45, 45, 45, 45, 45, 45, 45, 91, 91, 91, 91, 91, 91, 91, 91, -5, -5, -5, -5, -5, -5, -5, -5, },
  53. { -1, -1, -1, -1, -1, -1, -1, -1, 24, 24, 24, 24, 24, 24, 24, 24, 109,109,109,109,109,109,109,109, -4, -4, -4, -4, -4, -4, -4, -4, },
  54. { 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 122,122,122,122,122,122,122,122, -3, -3, -3, -3, -3, -3, -3, -3, },
  55. },
  56. {
  57. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  58. { -4, -4, -4, -4, -4, -4, -4, -4, 124,124,124,124,124,124,124,124, 9, 9, 9, 9, 9, 9, 9, 9, -1, -1, -1, -1, -1, -1, -1, -1, },
  59. { -5, -5, -5, -5, -5, -5, -5, -5, 110,110,110,110,110,110,110,110, 25, 25, 25, 25, 25, 25, 25, 25, -2, -2, -2, -2, -2, -2, -2, -2, },
  60. { -6, -6, -6, -6, -6, -6, -6, -6, 91, 91, 91, 91, 91, 91, 91, 91, 46, 46, 46, 46, 46, 46, 46, 46, -3, -3, -3, -3, -3, -3, -3, -3, },
  61. { -5, -5, -5, -5, -5, -5, -5, -5, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, -5, -5, -5, -5, -5, -5, -5, -5, },
  62. { -3, -3, -3, -3, -3, -3, -3, -3, 46, 46, 46, 46, 46, 46, 46, 46, 91, 91, 91, 91, 91, 91, 91, 91, -6, -6, -6, -6, -6, -6, -6, -6, },
  63. { -2, -2, -2, -2, -2, -2, -2, -2, 25, 25, 25, 25, 25, 25, 25, 25, 110,110,110,110,110,110,110,110, -5, -5, -5, -5, -5, -5, -5, -5, },
  64. { -1, -1, -1, -1, -1, -1, -1, -1, 9, 9, 9, 9, 9, 9, 9, 9, 124,124,124,124,124,124,124,124, -4, -4, -4, -4, -4, -4, -4, -4, },
  65. },
  66. {
  67. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  68. { -4, -4, -4, -4, -4, -4, -4, -4, 123,123,123,123,123,123,123,123, 10, 10, 10, 10, 10, 10, 10, 10, -1, -1, -1, -1, -1, -1, -1, -1, },
  69. { -6, -6, -6, -6, -6, -6, -6, -6, 110,110,110,110,110,110,110,110, 26, 26, 26, 26, 26, 26, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, },
  70. { -7, -7, -7, -7, -7, -7, -7, -7, 92, 92, 92, 92, 92, 92, 92, 92, 47, 47, 47, 47, 47, 47, 47, 47, -4, -4, -4, -4, -4, -4, -4, -4, },
  71. { -6, -6, -6, -6, -6, -6, -6, -6, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -6, -6, -6, -6, -6, -6, -6, -6, },
  72. { -4, -4, -4, -4, -4, -4, -4, -4, 47, 47, 47, 47, 47, 47, 47, 47, 92, 92, 92, 92, 92, 92, 92, 92, -7, -7, -7, -7, -7, -7, -7, -7, },
  73. { -2, -2, -2, -2, -2, -2, -2, -2, 26, 26, 26, 26, 26, 26, 26, 26, 110,110,110,110,110,110,110,110, -6, -6, -6, -6, -6, -6, -6, -6, },
  74. { -1, -1, -1, -1, -1, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 123,123,123,123,123,123,123,123, -4, -4, -4, -4, -4, -4, -4, -4, },
  75. },
  76. {
  77. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  78. { -5, -5, -5, -5, -5, -5, -5, -5, 124,124,124,124,124,124,124,124, 10, 10, 10, 10, 10, 10, 10, 10, -1, -1, -1, -1, -1, -1, -1, -1, },
  79. { -7, -7, -7, -7, -7, -7, -7, -7, 110,110,110,110,110,110,110,110, 27, 27, 27, 27, 27, 27, 27, 27, -2, -2, -2, -2, -2, -2, -2, -2, },
  80. { -7, -7, -7, -7, -7, -7, -7, -7, 91, 91, 91, 91, 91, 91, 91, 91, 48, 48, 48, 48, 48, 48, 48, 48, -4, -4, -4, -4, -4, -4, -4, -4, },
  81. { -6, -6, -6, -6, -6, -6, -6, -6, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -6, -6, -6, -6, -6, -6, -6, -6, },
  82. { -4, -4, -4, -4, -4, -4, -4, -4, 48, 48, 48, 48, 48, 48, 48, 48, 92, 92, 92, 92, 92, 92, 92, 92, -8, -8, -8, -8, -8, -8, -8, -8, },
  83. { -2, -2, -2, -2, -2, -2, -2, -2, 27, 27, 27, 27, 27, 27, 27, 27, 110,110,110,110,110,110,110,110, -7, -7, -7, -7, -7, -7, -7, -7, },
  84. { -1, -1, -1, -1, -1, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 124,124,124,124,124,124,124,124, -5, -5, -5, -5, -5, -5, -5, -5, },
  85. },
  86. {
  87. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  88. { -6, -6, -6, -6, -6, -6, -6, -6, 124,124,124,124,124,124,124,124, 11, 11, 11, 11, 11, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1, },
  89. { -8, -8, -8, -8, -8, -8, -8, -8, 111,111,111,111,111,111,111,111, 28, 28, 28, 28, 28, 28, 28, 28, -3, -3, -3, -3, -3, -3, -3, -3, },
  90. { -8, -8, -8, -8, -8, -8, -8, -8, 92, 92, 92, 92, 92, 92, 92, 92, 49, 49, 49, 49, 49, 49, 49, 49, -5, -5, -5, -5, -5, -5, -5, -5, },
  91. { -7, -7, -7, -7, -7, -7, -7, -7, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, -7, -7, -7, -7, -7, -7, -7, -7, },
  92. { -5, -5, -5, -5, -5, -5, -5, -5, 49, 49, 49, 49, 49, 49, 49, 49, 92, 92, 92, 92, 92, 92, 92, 92, -8, -8, -8, -8, -8, -8, -8, -8, },
  93. { -3, -3, -3, -3, -3, -3, -3, -3, 28, 28, 28, 28, 28, 28, 28, 28, 111,111,111,111,111,111,111,111, -8, -8, -8, -8, -8, -8, -8, -8, },
  94. { -1, -1, -1, -1, -1, -1, -1, -1, 11, 11, 11, 11, 11, 11, 11, 11, 124,124,124,124,124,124,124,124, -6, -6, -6, -6, -6, -6, -6, -6, },
  95. },
  96. {
  97. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  98. { -6, -6, -6, -6, -6, -6, -6, -6, 123,123,123,123,123,123,123,123, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, },
  99. { -9, -9, -9, -9, -9, -9, -9, -9, 111,111,111,111,111,111,111,111, 29, 29, 29, 29, 29, 29, 29, 29, -3, -3, -3, -3, -3, -3, -3, -3, },
  100. { -9, -9, -9, -9, -9, -9, -9, -9, 93, 93, 93, 93, 93, 93, 93, 93, 50, 50, 50, 50, 50, 50, 50, 50, -6, -6, -6, -6, -6, -6, -6, -6, },
  101. { -8, -8, -8, -8, -8, -8, -8, -8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, -8, -8, -8, -8, -8, -8, -8, -8, },
  102. { -6, -6, -6, -6, -6, -6, -6, -6, 50, 50, 50, 50, 50, 50, 50, 50, 93, 93, 93, 93, 93, 93, 93, 93, -9, -9, -9, -9, -9, -9, -9, -9, },
  103. { -3, -3, -3, -3, -3, -3, -3, -3, 29, 29, 29, 29, 29, 29, 29, 29, 111,111,111,111,111,111,111,111, -9, -9, -9, -9, -9, -9, -9, -9, },
  104. { -1, -1, -1, -1, -1, -1, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 123,123,123,123,123,123,123,123, -6, -6, -6, -6, -6, -6, -6, -6, },
  105. },
  106. {
  107. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  108. { -7, -7, -7, -7, -7, -7, -7, -7, 124,124,124,124,124,124,124,124, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, },
  109. { -10,-10,-10,-10,-10,-10,-10,-10, 111,111,111,111,111,111,111,111, 30, 30, 30, 30, 30, 30, 30, 30, -3, -3, -3, -3, -3, -3, -3, -3, },
  110. { -10,-10,-10,-10,-10,-10,-10,-10, 93, 93, 93, 93, 93, 93, 93, 93, 51, 51, 51, 51, 51, 51, 51, 51, -6, -6, -6, -6, -6, -6, -6, -6, },
  111. { -9, -9, -9, -9, -9, -9, -9, -9, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, -9, -9, -9, -9, -9, -9, -9, -9, },
  112. { -6, -6, -6, -6, -6, -6, -6, -6, 51, 51, 51, 51, 51, 51, 51, 51, 93, 93, 93, 93, 93, 93, 93, 93, -10,-10,-10,-10,-10,-10,-10,-10, },
  113. { -3, -3, -3, -3, -3, -3, -3, -3, 30, 30, 30, 30, 30, 30, 30, 30, 111,111,111,111,111,111,111,111, -10,-10,-10,-10,-10,-10,-10,-10, },
  114. { -1, -1, -1, -1, -1, -1, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 124,124,124,124,124,124,124,124, -7, -7, -7, -7, -7, -7, -7, -7, },
  115. },
  116. {
  117. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  118. { -7, -7, -7, -7, -7, -7, -7, -7, 123,123,123,123,123,123,123,123, 13, 13, 13, 13, 13, 13, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1, },
  119. { -11,-11,-11,-11,-11,-11,-11,-11, 112,112,112,112,112,112,112,112, 31, 31, 31, 31, 31, 31, 31, 31, -4, -4, -4, -4, -4, -4, -4, -4, },
  120. { -11,-11,-11,-11,-11,-11,-11,-11, 94, 94, 94, 94, 94, 94, 94, 94, 52, 52, 52, 52, 52, 52, 52, 52, -7, -7, -7, -7, -7, -7, -7, -7, },
  121. { -10,-10,-10,-10,-10,-10,-10,-10, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -10,-10,-10,-10,-10,-10,-10,-10, },
  122. { -7, -7, -7, -7, -7, -7, -7, -7, 52, 52, 52, 52, 52, 52, 52, 52, 94, 94, 94, 94, 94, 94, 94, 94, -11,-11,-11,-11,-11,-11,-11,-11, },
  123. { -4, -4, -4, -4, -4, -4, -4, -4, 31, 31, 31, 31, 31, 31, 31, 31, 112,112,112,112,112,112,112,112, -11,-11,-11,-11,-11,-11,-11,-11, },
  124. { -1, -1, -1, -1, -1, -1, -1, -1, 13, 13, 13, 13, 13, 13, 13, 13, 123,123,123,123,123,123,123,123, -7, -7, -7, -7, -7, -7, -7, -7, },
  125. },
  126. {
  127. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  128. { -8, -8, -8, -8, -8, -8, -8, -8, 124,124,124,124,124,124,124,124, 13, 13, 13, 13, 13, 13, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1, },
  129. { -12,-12,-12,-12,-12,-12,-12,-12, 112,112,112,112,112,112,112,112, 32, 32, 32, 32, 32, 32, 32, 32, -4, -4, -4, -4, -4, -4, -4, -4, },
  130. { -12,-12,-12,-12,-12,-12,-12,-12, 94, 94, 94, 94, 94, 94, 94, 94, 53, 53, 53, 53, 53, 53, 53, 53, -7, -7, -7, -7, -7, -7, -7, -7, },
  131. { -10,-10,-10,-10,-10,-10,-10,-10, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -10,-10,-10,-10,-10,-10,-10,-10, },
  132. { -7, -7, -7, -7, -7, -7, -7, -7, 53, 53, 53, 53, 53, 53, 53, 53, 94, 94, 94, 94, 94, 94, 94, 94, -12,-12,-12,-12,-12,-12,-12,-12, },
  133. { -4, -4, -4, -4, -4, -4, -4, -4, 32, 32, 32, 32, 32, 32, 32, 32, 112,112,112,112,112,112,112,112, -12,-12,-12,-12,-12,-12,-12,-12, },
  134. { -1, -1, -1, -1, -1, -1, -1, -1, 13, 13, 13, 13, 13, 13, 13, 13, 124,124,124,124,124,124,124,124, -8, -8, -8, -8, -8, -8, -8, -8, },
  135. },
  136. {
  137. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  138. { -9, -9, -9, -9, -9, -9, -9, -9, 124,124,124,124,124,124,124,124, 14, 14, 14, 14, 14, 14, 14, 14, -1, -1, -1, -1, -1, -1, -1, -1, },
  139. { -13,-13,-13,-13,-13,-13,-13,-13, 112,112,112,112,112,112,112,112, 33, 33, 33, 33, 33, 33, 33, 33, -4, -4, -4, -4, -4, -4, -4, -4, },
  140. { -13,-13,-13,-13,-13,-13,-13,-13, 95, 95, 95, 95, 95, 95, 95, 95, 54, 54, 54, 54, 54, 54, 54, 54, -8, -8, -8, -8, -8, -8, -8, -8, },
  141. { -11,-11,-11,-11,-11,-11,-11,-11, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, -11,-11,-11,-11,-11,-11,-11,-11, },
  142. { -8, -8, -8, -8, -8, -8, -8, -8, 54, 54, 54, 54, 54, 54, 54, 54, 95, 95, 95, 95, 95, 95, 95, 95, -13,-13,-13,-13,-13,-13,-13,-13, },
  143. { -4, -4, -4, -4, -4, -4, -4, -4, 33, 33, 33, 33, 33, 33, 33, 33, 112,112,112,112,112,112,112,112, -13,-13,-13,-13,-13,-13,-13,-13, },
  144. { -1, -1, -1, -1, -1, -1, -1, -1, 14, 14, 14, 14, 14, 14, 14, 14, 124,124,124,124,124,124,124,124, -9, -9, -9, -9, -9, -9, -9, -9, },
  145. },
  146. {
  147. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  148. { -9, -9, -9, -9, -9, -9, -9, -9, 123,123,123,123,123,123,123,123, 15, 15, 15, 15, 15, 15, 15, 15, -1, -1, -1, -1, -1, -1, -1, -1, },
  149. { -14,-14,-14,-14,-14,-14,-14,-14, 113,113,113,113,113,113,113,113, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5, },
  150. { -14,-14,-14,-14,-14,-14,-14,-14, 95, 95, 95, 95, 95, 95, 95, 95, 55, 55, 55, 55, 55, 55, 55, 55, -8, -8, -8, -8, -8, -8, -8, -8, },
  151. { -12,-12,-12,-12,-12,-12,-12,-12, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, -12,-12,-12,-12,-12,-12,-12,-12, },
  152. { -8, -8, -8, -8, -8, -8, -8, -8, 55, 55, 55, 55, 55, 55, 55, 55, 95, 95, 95, 95, 95, 95, 95, 95, -14,-14,-14,-14,-14,-14,-14,-14, },
  153. { -5, -5, -5, -5, -5, -5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 112,112,112,112,112,112,112,112, -13,-13,-13,-13,-13,-13,-13,-13, },
  154. { -1, -1, -1, -1, -1, -1, -1, -1, 15, 15, 15, 15, 15, 15, 15, 15, 123,123,123,123,123,123,123,123, -9, -9, -9, -9, -9, -9, -9, -9, },
  155. },
  156. {
  157. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  158. { -10,-10,-10,-10,-10,-10,-10,-10, 124,124,124,124,124,124,124,124, 15, 15, 15, 15, 15, 15, 15, 15, -1, -1, -1, -1, -1, -1, -1, -1, },
  159. { -14,-14,-14,-14,-14,-14,-14,-14, 113,113,113,113,113,113,113,113, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5, },
  160. { -15,-15,-15,-15,-15,-15,-15,-15, 96, 96, 96, 96, 96, 96, 96, 96, 56, 56, 56, 56, 56, 56, 56, 56, -9, -9, -9, -9, -9, -9, -9, -9, },
  161. { -13,-13,-13,-13,-13,-13,-13,-13, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, -13,-13,-13,-13,-13,-13,-13,-13, },
  162. { -9, -9, -9, -9, -9, -9, -9, -9, 56, 56, 56, 56, 56, 56, 56, 56, 96, 96, 96, 96, 96, 96, 96, 96, -15,-15,-15,-15,-15,-15,-15,-15, },
  163. { -5, -5, -5, -5, -5, -5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 113,113,113,113,113,113,113,113, -14,-14,-14,-14,-14,-14,-14,-14, },
  164. { -1, -1, -1, -1, -1, -1, -1, -1, 15, 15, 15, 15, 15, 15, 15, 15, 124,124,124,124,124,124,124,124, -10,-10,-10,-10,-10,-10,-10,-10, },
  165. },
  166. {
  167. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  168. { -10,-10,-10,-10,-10,-10,-10,-10, 123,123,123,123,123,123,123,123, 16, 16, 16, 16, 16, 16, 16, 16, -1, -1, -1, -1, -1, -1, -1, -1, },
  169. { -15,-15,-15,-15,-15,-15,-15,-15, 113,113,113,113,113,113,113,113, 35, 35, 35, 35, 35, 35, 35, 35, -5, -5, -5, -5, -5, -5, -5, -5, },
  170. { -16,-16,-16,-16,-16,-16,-16,-16, 98, 98, 98, 98, 98, 98, 98, 98, 56, 56, 56, 56, 56, 56, 56, 56, -10,-10,-10,-10,-10,-10,-10,-10, },
  171. { -14,-14,-14,-14,-14,-14,-14,-14, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, -14,-14,-14,-14,-14,-14,-14,-14, },
  172. { -10,-10,-10,-10,-10,-10,-10,-10, 56, 56, 56, 56, 56, 56, 56, 56, 98, 98, 98, 98, 98, 98, 98, 98, -16,-16,-16,-16,-16,-16,-16,-16, },
  173. { -5, -5, -5, -5, -5, -5, -5, -5, 35, 35, 35, 35, 35, 35, 35, 35, 113,113,113,113,113,113,113,113, -15,-15,-15,-15,-15,-15,-15,-15, },
  174. { -1, -1, -1, -1, -1, -1, -1, -1, 16, 16, 16, 16, 16, 16, 16, 16, 123,123,123,123,123,123,123,123, -10,-10,-10,-10,-10,-10,-10,-10, },
  175. },
  176. {
  177. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  178. { -11,-11,-11,-11,-11,-11,-11,-11, 124,124,124,124,124,124,124,124, 17, 17, 17, 17, 17, 17, 17, 17, -2, -2, -2, -2, -2, -2, -2, -2, },
  179. { -16,-16,-16,-16,-16,-16,-16,-16, 113,113,113,113,113,113,113,113, 36, 36, 36, 36, 36, 36, 36, 36, -5, -5, -5, -5, -5, -5, -5, -5, },
  180. { -17,-17,-17,-17,-17,-17,-17,-17, 98, 98, 98, 98, 98, 98, 98, 98, 57, 57, 57, 57, 57, 57, 57, 57, -10,-10,-10,-10,-10,-10,-10,-10, },
  181. { -14,-14,-14,-14,-14,-14,-14,-14, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, -14,-14,-14,-14,-14,-14,-14,-14, },
  182. { -10,-10,-10,-10,-10,-10,-10,-10, 57, 57, 57, 57, 57, 57, 57, 57, 98, 98, 98, 98, 98, 98, 98, 98, -17,-17,-17,-17,-17,-17,-17,-17, },
  183. { -5, -5, -5, -5, -5, -5, -5, -5, 36, 36, 36, 36, 36, 36, 36, 36, 113,113,113,113,113,113,113,113, -16,-16,-16,-16,-16,-16,-16,-16, },
  184. { -2, -2, -2, -2, -2, -2, -2, -2, 17, 17, 17, 17, 17, 17, 17, 17, 124,124,124,124,124,124,124,124, -11,-11,-11,-11,-11,-11,-11,-11, },
  185. },
  186. {
  187. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  188. { -12,-12,-12,-12,-12,-12,-12,-12, 125,125,125,125,125,125,125,125, 17, 17, 17, 17, 17, 17, 17, 17, -2, -2, -2, -2, -2, -2, -2, -2, },
  189. { -17,-17,-17,-17,-17,-17,-17,-17, 114,114,114,114,114,114,114,114, 37, 37, 37, 37, 37, 37, 37, 37, -6, -6, -6, -6, -6, -6, -6, -6, },
  190. { -18,-18,-18,-18,-18,-18,-18,-18, 99, 99, 99, 99, 99, 99, 99, 99, 58, 58, 58, 58, 58, 58, 58, 58, -11,-11,-11,-11,-11,-11,-11,-11, },
  191. { -15,-15,-15,-15,-15,-15,-15,-15, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, -15,-15,-15,-15,-15,-15,-15,-15, },
  192. { -11,-11,-11,-11,-11,-11,-11,-11, 58, 58, 58, 58, 58, 58, 58, 58, 99, 99, 99, 99, 99, 99, 99, 99, -18,-18,-18,-18,-18,-18,-18,-18, },
  193. { -6, -6, -6, -6, -6, -6, -6, -6, 37, 37, 37, 37, 37, 37, 37, 37, 114,114,114,114,114,114,114,114, -17,-17,-17,-17,-17,-17,-17,-17, },
  194. { -2, -2, -2, -2, -2, -2, -2, -2, 17, 17, 17, 17, 17, 17, 17, 17, 125,125,125,125,125,125,125,125, -12,-12,-12,-12,-12,-12,-12,-12, },
  195. },
  196. {
  197. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
  198. { -12,-12,-12,-12,-12,-12,-12,-12, 124,124,124,124,124,124,124,124, 18, 18, 18, 18, 18, 18, 18, 18, -2, -2, -2, -2, -2, -2, -2, -2, },
  199. { -18,-18,-18,-18,-18,-18,-18,-18, 114,114,114,114,114,114,114,114, 38, 38, 38, 38, 38, 38, 38, 38, -6, -6, -6, -6, -6, -6, -6, -6, },
  200. { -19,-19,-19,-19,-19,-19,-19,-19, 99, 99, 99, 99, 99, 99, 99, 99, 59, 59, 59, 59, 59, 59, 59, 59, -11,-11,-11,-11,-11,-11,-11,-11, },
  201. { -16,-16,-16,-16,-16,-16,-16,-16, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, -16,-16,-16,-16,-16,-16,-16,-16, },
  202. { -11,-11,-11,-11,-11,-11,-11,-11, 59, 59, 59, 59, 59, 59, 59, 59, 99, 99, 99, 99, 99, 99, 99, 99, -19,-19,-19,-19,-19,-19,-19,-19, },
  203. { -6, -6, -6, -6, -6, -6, -6, -6, 38, 38, 38, 38, 38, 38, 38, 38, 114,114,114,114,114,114,114,114, -18,-18,-18,-18,-18,-18,-18,-18, },
  204. { -2, -2, -2, -2, -2, -2, -2, -2, 18, 18, 18, 18, 18, 18, 18, 18, 124,124,124,124,124,124,124,124, -12,-12,-12,-12,-12,-12,-12,-12, },
  205. },
  206. // Dummy entry for VP61 supporty
  207. {
  208. { 0, 0, 0, 0, 0, 0, 0, 0, 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  209. { -4, -4, -4, -4, -4, -4, -4, -4, 118,118,118,118,118,118,118,118, 16, 16, 16, 16, 16, 16, 16, 16, -2, -2, -2, -2, -2, -2, -2, -2 },
  210. { -7, -7, -7, -7, -7, -7, -7, -7, 106,106,106,106,106,106,106,106, 34, 34, 34, 34, 34, 34, 34, 34, -5, -5, -5, -5, -5, -5, -5, -5 },
  211. { -8, -8, -8, -8, -8, -8, -8, -8, 90, 90, 90, 90, 90, 90, 90, 90, 53, 53, 53, 53, 53, 53, 53, 53, -7, -7, -7, -7, -7, -7, -7, -7 },
  212. { -8, -8, -8, -8, -8, -8, -8, -8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, -8, -8, -8, -8, -8, -8, -8, -8 },
  213. { -7, -7, -7, -7, -7, -7, -7, -7, 53, 53, 53, 53, 53, 53, 53, 53, 90, 90, 90, 90, 90, 90, 90, 90, -8, -8, -8, -8, -8, -8, -8, -8 },
  214. { -5, -5, -5, -5, -5, -5, -5, -5, 34, 34, 34, 34, 34, 34, 34, 34, 106,106,106,106,106,106,106,106, -7, -7, -7, -7, -7, -7, -7, -7 },
  215. { -2, -2, -2, -2, -2, -2, -2, -2, 16, 16, 16, 16, 16, 16, 16, 16, 118,118,118,118,118,118,118,118, -4, -4, -4, -4, -4, -4, -4, -4 }
  216. }
  217. };
  218. void FilterBlock1d_h_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
  219. {
  220. __asm
  221. {
  222. mov edi, Filter
  223. movq mm1, [edi] ; mm3 *= kernel 0 modifiers.
  224. movq mm2, [edi+ 16] ; mm3 *= kernel 0 modifiers.
  225. movq mm6, [edi + 32] ; mm3 *= kernel 0 modifiers.
  226. movq mm7, [edi + 48] ; mm3 *= kernel 0 modifiers.
  227. mov edi,OutputPtr
  228. mov esi,SrcPtr
  229. dec esi
  230. mov ecx, DWORD PTR OutputHeight
  231. mov eax, OutputWidth ; destination pitch?
  232. pxor mm0, mm0 ; mm0 = 00000000
  233. nextrow:
  234. movq mm3, [esi] ; mm3 = p-1..p6
  235. movq mm4, mm3 ; mm4 = p-1..p6
  236. punpcklbw mm3, mm0 ; mm3 = p-1..p2
  237. pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
  238. psrlq mm4, 24 ; mm4 = p2..p6
  239. movq mm5, mm4 ; mm5 = p2..p6
  240. punpcklbw mm5, mm0 ; mm5 = p2..p5
  241. pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers
  242. paddsw mm3, mm5 ; mm3 += mm5
  243. movq mm4, [esi+1] ; mm4 = p0..p6
  244. movq mm5, mm4 ; mm5 = p0..p6
  245. punpcklbw mm5, mm0 ; mm5 = p0..p3
  246. pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers
  247. paddsw mm3, mm5 ; mm3 += mm5
  248. psrlq mm4, 8 ; mm4 = p1..p6
  249. movq mm5, mm4 ; mm5 = p1..p6
  250. punpcklbw mm5, mm0 ; mm5 = p1..p4
  251. pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers
  252. paddsw mm3, mm5 ; mm3 += mm5
  253. paddsw mm3, rd ; mm3 += round value
  254. psraw mm3, FILTER_SHIFT ; mm3 /= 128
  255. packuswb mm3, mm0 ; pack and unpack to saturate
  256. movd [edi],mm3 ; store the results in the destination
  257. movq mm3, [esi+4] ; mm3 = p-1..p6
  258. movq mm4, mm3 ; mm4 = p-1..p6
  259. punpcklbw mm3, mm0 ; mm3 = p-1..p2
  260. pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
  261. psrlq mm4, 24 ; mm4 = p2..p6
  262. movq mm5, mm4 ; mm5 = p2..p6
  263. punpcklbw mm5, mm0 ; mm5 = p2..p5
  264. pmullw mm5, mm7 ; mm5 *= kernel 3 modifiers
  265. paddsw mm3, mm5 ; mm3 += mm5
  266. movq mm4, [esi+5] ; mm4 = p0..p6
  267. movq mm5, mm4 ; mm5 = p0..p6
  268. punpcklbw mm5, mm0 ; mm5 = p0..p3
  269. pmullw mm5, mm2 ; mm5 *= kernel 1 modifiers
  270. paddsw mm3, mm5 ; mm3 += mm5
  271. psrlq mm4, 8 ; mm4 = p1..p6
  272. movq mm5, mm4 ; mm5 = p1..p6
  273. punpcklbw mm5, mm0 ; mm5 = p1..p4
  274. pmullw mm5, mm6 ; mm5 *= kernel 2 modifiers
  275. paddsw mm3, mm5 ; mm3 += mm5
  276. paddsw mm3, rd ; mm3 += round value
  277. psraw mm3, FILTER_SHIFT ; mm3 /= 128
  278. packuswb mm3, mm0 ; pack and unpack to saturate
  279. movd [edi+4],mm3 ; store the results in the destination
  280. add esi,SrcPixelsPerLine ; next line
  281. add edi,eax;
  282. dec ecx ; decrement count
  283. jnz nextrow ; next row
  284. }
  285. }
  286. void FilterBlock1d_v_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
  287. {
  288. __asm
  289. {
  290. mov edi, Filter
  291. movq mm1, [edi] ; mm3 *= kernel 0 modifiers.
  292. movq mm2, [edi + 16] ; mm3 *= kernel 0 modifiers.
  293. movq mm6, [edi + 32] ; mm3 *= kernel 0 modifiers.
  294. movq mm7, [edi + 48] ; mm3 *= kernel 0 modifiers.
  295. mov edx, PixelsPerLine
  296. mov edi, OutputPtr
  297. mov esi, SrcPtr
  298. sub esi, PixelsPerLine
  299. mov ecx, DWORD PTR OutputHeight
  300. mov eax, OutputWidth ; destination pitch?
  301. pxor mm0, mm0 ; mm0 = 00000000
  302. nextrow:
  303. movq mm3, [esi] ; mm3 = p0..p8
  304. punpcklbw mm3, mm0 ; mm3 = p0..p3
  305. pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
  306. add esi, edx ; move source forward 1 line to avoid 3 * pitch
  307. movq mm4, [esi+2*edx] ; mm4 = p0..p8
  308. punpcklbw mm4, mm0 ; mm4 = p0..p3
  309. pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers.
  310. paddsw mm3, mm4 ; mm3 += mm4
  311. movq mm4, [esi ] ; mm4 = p0..p8
  312. punpcklbw mm4, mm0 ; mm4 = p0..p3
  313. pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers.
  314. paddsw mm3, mm4 ; mm3 += mm4
  315. movq mm4, [esi +edx] ; mm4 = p0..p8
  316. punpcklbw mm4, mm0 ; mm4 = p0..p3
  317. pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers.
  318. paddsw mm3, mm4 ; mm3 += mm4
  319. paddsw mm3, rd ; mm3 += round value
  320. psraw mm3, FILTER_SHIFT ; mm3 /= 128
  321. packuswb mm3, mm0 ; pack and saturate
  322. movd [edi],mm3 ; store the results in the destination
  323. sub esi, edx ; subtract edx to get back to -1 column
  324. movq mm3, [esi+4] ; mm3 = p4..p12
  325. punpcklbw mm3, mm0 ; mm3 = p4..p7
  326. pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers.
  327. add esi, edx ; move source forward 1 line to avoid 3 * pitch
  328. movq mm4, [esi+2*edx+4] ; mm4 = p0..p8
  329. punpcklbw mm4, mm0 ; mm4 = p0..p3
  330. pmullw mm4, mm7 ; mm4 *= kernel 3 modifiers.
  331. paddsw mm3, mm4 ; mm3 += mm4
  332. movq mm4, [esi +4] ; mm4 = p0..p8
  333. punpcklbw mm4, mm0 ; mm4 = p0..p3
  334. pmullw mm4, mm2 ; mm4 *= kernel 1 modifiers.
  335. paddsw mm3, mm4 ; mm3 += mm4
  336. movq mm4, [esi +edx+4] ; mm4 = p0..p8
  337. punpcklbw mm4, mm0 ; mm4 = p0..p3
  338. pmullw mm4, mm6 ; mm4 *= kernel 2 modifiers.
  339. paddsw mm3, mm4 ; mm3 += mm4
  340. paddsw mm3, rd ; mm3 += round value
  341. psraw mm3, FILTER_SHIFT ; mm3 /= 128
  342. packuswb mm3, mm0 ; pack and saturate
  343. movd [edi+4],mm3 ; store the results in the destination
  344. // the subsequent iterations repeat 3 out of 4 of these reads. Since the
  345. // recon block should be in cache this shouldn't cost much. Its obviously
  346. // avoidable!!!.
  347. add edi,eax;
  348. dec ecx ; decrement count
  349. jnz nextrow ; next row
  350. }
  351. }
/*
 * FilterBlock1d_h_mmxa
 *
 * Horizontal 4-tap FIR filter producing an 8-pixel-wide block using MMX.
 * Each output pixel is the weighted sum of the 4 source pixels starting
 * one pixel to its left, plus a rounding constant ('rd' — defined elsewhere
 * in this file, not visible in this chunk), arithmetic-shifted right by
 * FILTER_SHIFT and saturated to unsigned 8 bits.
 *
 * SrcPtr           : source pixels (the filter window starts at SrcPtr-1)
 * OutputPtr        : destination block
 * SrcPixelsPerLine : source stride in bytes
 * PixelStep        : unused in this routine (horizontal step is implicitly 1)
 * OutputHeight     : number of rows produced
 * OutputWidth      : destination stride in bytes (also the row width, 8 here)
 * Filter           : four tap sets of INT16s, spaced 16 bytes apart
 *                    (tap k for the 4 lanes lives at Filter + 8*k words)
 */
void FilterBlock1d_h_mmxa( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
movq mm1, [edi] ; mm1 = tap set 0 (4 words)
movq mm2, [edi+ 16] ; mm2 = tap set 1
movq mm6, [edi + 32] ; mm6 = tap set 2
movq mm7, [edi + 48] ; mm7 = tap set 3
mov edi,OutputPtr
mov esi,SrcPtr
dec esi ; start one pixel left: window covers p-1..p2 for output p0
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch
pxor mm0, mm0 ; mm0 = 0, used to unpack bytes to words
nextrow:
; ---- left half of the row: output pixels 0..3 ----
movq mm3, [esi] ; mm3 = bytes p-1..p6
movq mm4, mm3 ; mm4 = working copy for successive shifts
punpcklbw mm3, mm0 ; mm3 = p-1..p2 as words
pmullw mm3, mm1 ; mm3 *= tap set 0
psrlq mm4, 8 ; mm4 = p0..p6
movq mm5, mm4 ; mm5 = p0..p6
punpcklbw mm5, mm0 ; mm5 = p0..p3 as words
pmullw mm5, mm2 ; mm5 *= tap set 1
paddw mm3, mm5 ; accumulate
psrlq mm4, 8 ; mm4 = p1..p6
movq mm5, mm4 ; mm5 = p1..p6
punpcklbw mm5, mm0 ; mm5 = p1..p4 as words
pmullw mm5, mm6 ; mm5 *= tap set 2
paddw mm3, mm5 ; accumulate
psrlq mm4, 8 ; mm4 = p2..p6
movq mm5, mm4 ; mm5 = p2..p6
punpcklbw mm5, mm0 ; mm5 = p2..p5 as words
pmullw mm5, mm7 ; mm5 *= tap set 3
paddw mm3, mm5 ; accumulate
paddw mm3, rd ; add rounding constant
psraw mm3, FILTER_SHIFT ; scale back down to pixel range
packuswb mm3, mm0 ; saturate words to unsigned bytes
movd [edi],mm3 ; store output pixels 0..3
; ---- right half of the row: output pixels 4..7 (same steps, source +4) ----
movq mm3, [esi+4] ; mm3 = bytes p3..p10
movq mm4, mm3 ; mm4 = working copy for successive shifts
punpcklbw mm3, mm0 ; mm3 = p3..p6 as words
pmullw mm3, mm1 ; mm3 *= tap set 0
psrlq mm4, 8 ; shift next pixel into the low lane
movq mm5, mm4 ;
punpcklbw mm5, mm0 ; mm5 = p4..p7 as words
pmullw mm5, mm2 ; mm5 *= tap set 1
paddw mm3, mm5 ; accumulate
psrlq mm4, 8 ;
movq mm5, mm4 ;
punpcklbw mm5, mm0 ; mm5 = p5..p8 as words
pmullw mm5, mm6 ; mm5 *= tap set 2
paddw mm3, mm5 ; accumulate
psrlq mm4, 8 ;
movq mm5, mm4 ;
punpcklbw mm5, mm0 ; mm5 = p6..p9 as words
pmullw mm5, mm7 ; mm5 *= tap set 3
paddw mm3, mm5 ; accumulate
paddw mm3, rd ; add rounding constant
psraw mm3, FILTER_SHIFT ; scale back down to pixel range
packuswb mm3, mm0 ; saturate words to unsigned bytes
movd [edi+4],mm3 ; store output pixels 4..7
add esi,SrcPixelsPerLine ; advance source to next line
add edi,eax; advance destination by its pitch
dec ecx ; decrement row count
jnz nextrow ; next row
}
}
/*
 * FilterBlock1d_v_mmxa
 *
 * Vertical 4-tap FIR filter producing an 8-pixel-wide block using MMX.
 * Each output pixel is the weighted sum of the 4 vertically adjacent source
 * pixels starting one line above it, plus the rounding constant 'rd'
 * (defined elsewhere in this file), shifted right by FILTER_SHIFT and
 * saturated to unsigned 8 bits.
 *
 * SrcPtr        : source pixels (the filter window starts one line above)
 * OutputPtr     : destination block
 * PixelsPerLine : source stride in bytes (also the vertical tap step)
 * PixelStep     : unused in this routine
 * OutputHeight  : number of rows produced
 * OutputWidth   : destination stride in bytes (also the row width, 8 here)
 * Filter        : four tap sets of INT16s, spaced 16 bytes apart
 */
void FilterBlock1d_v_mmxa( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
movq mm1, [edi] ; mm1 = tap set 0 (4 words)
movq mm2, [edi + 16] ; mm2 = tap set 1
movq mm6, [edi + 32] ; mm6 = tap set 2
movq mm7, [edi + 48] ; mm7 = tap set 3
mov edx, PixelsPerLine
mov edi, OutputPtr
mov esi, SrcPtr
sub esi, PixelsPerLine ; start one line above: window covers rows -1..2
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch
pxor mm0, mm0 ; mm0 = 0, used to unpack bytes to words
nextrow:
; ---- left half of the row: output pixels 0..3 ----
movq mm3, [esi] ; row -1, pixels 0..7
punpcklbw mm3, mm0 ; row -1, pixels 0..3 as words
pmullw mm3, mm1 ; mm3 *= tap set 0
movq mm4, [esi +edx ] ; row 0, pixels 0..7
punpcklbw mm4, mm0 ; row 0, pixels 0..3 as words
pmullw mm4, mm2 ; mm4 *= tap set 1
paddw mm3, mm4 ; accumulate
movq mm4, [esi +2*edx] ; row 1, pixels 0..7
punpcklbw mm4, mm0 ; row 1, pixels 0..3 as words
pmullw mm4, mm6 ; mm4 *= tap set 2
paddw mm3, mm4 ; accumulate
add esi, edx ; move source forward 1 line to avoid 3 * pitch
movq mm4, [esi+2*edx] ; row 2, pixels 0..7
punpcklbw mm4, mm0 ; row 2, pixels 0..3 as words
pmullw mm4, mm7 ; mm4 *= tap set 3
paddw mm3, mm4 ; accumulate
paddw mm3, rd ; add rounding constant
psraw mm3, FILTER_SHIFT ; scale back down to pixel range
packuswb mm3, mm0 ; saturate words to unsigned bytes
movd [edi],mm3 ; store output pixels 0..3
sub esi, edx ; undo the temporary line advance above
; ---- right half of the row: output pixels 4..7 (same steps, source +4) ----
movq mm3, [esi+4] ; row -1, pixels 4..11
punpcklbw mm3, mm0 ; row -1, pixels 4..7 as words
pmullw mm3, mm1 ; mm3 *= tap set 0
movq mm4, [esi +edx +4] ; row 0, pixels 4..11
punpcklbw mm4, mm0 ; row 0, pixels 4..7 as words
pmullw mm4, mm2 ; mm4 *= tap set 1
paddw mm3, mm4 ; accumulate
movq mm4, [esi +2*edx+4] ; row 1, pixels 4..11
punpcklbw mm4, mm0 ; row 1, pixels 4..7 as words
pmullw mm4, mm6 ; mm4 *= tap set 2
paddw mm3, mm4 ; accumulate
add esi, edx ; net +1 line per iteration (no matching sub this time)
movq mm4, [esi+2*edx+4] ; row 2, pixels 4..11
punpcklbw mm4, mm0 ; row 2, pixels 4..7 as words
pmullw mm4, mm7 ; mm4 *= tap set 3
paddw mm3, mm4 ; accumulate
paddw mm3, rd ; add rounding constant
psraw mm3, FILTER_SHIFT ; scale back down to pixel range
packuswb mm3, mm0 ; saturate words to unsigned bytes
movd [edi+4],mm3 ; store output pixels 4..7
// the subsequent iterations repeat 3 out of 4 of these reads. Since the
// recon block should be in cache this shouldn't cost much. Its obviously
// avoidable!!!.
add edi,eax; advance destination by its pitch
dec ecx ; decrement row count
jnz nextrow ; next row
}
}
/*
 * FilterBlock1d_hb8_mmx
 *
 * Horizontal 2-tap (bilinear) filter producing an 8-pixel-wide block using
 * MMX.  Each output pixel is p[i]*tap0 + p[i+1]*tap1, plus the rounding
 * constant 'rd' (defined elsewhere in this file), shifted right by
 * FILTER_SHIFT and saturated to unsigned 8 bits.
 *
 * SrcPtr           : source pixels (window starts at SrcPtr itself)
 * OutputPtr        : destination block
 * SrcPixelsPerLine : source stride in bytes
 * PixelStep        : unused in this routine (horizontal step is implicitly 1)
 * OutputHeight     : number of rows produced
 * OutputWidth      : destination stride in bytes (also the row width, 8 here)
 * Filter           : two tap sets of INT16s, spaced 16 bytes apart
 */
void FilterBlock1d_hb8_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
movq mm1, [edi] ; mm1 = tap set 0 (4 words)
movq mm2, [edi + 16] ; mm2 = tap set 1
mov edi,OutputPtr
mov esi,SrcPtr
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch
pxor mm0, mm0 ; mm0 = 0, used to unpack bytes to words
nextrow:
; ---- left half of the row: output pixels 0..3 ----
movq mm3, [esi] ; mm3 = bytes p0..p7
movq mm4, mm3 ; mm4 = copy for the shifted (right neighbour) lane
punpcklbw mm3, mm0 ; mm3 = p0..p3 as words
pmullw mm3, mm1 ; mm3 *= tap set 0
psrlq mm4, 8 ; mm4 = p1..p7
movq mm5, mm4 ; mm5 = p1..p7
punpcklbw mm5, mm0 ; mm5 = p1..p4 as words
pmullw mm5, mm2 ; mm5 *= tap set 1
paddw mm3, mm5 ; accumulate
paddw mm3, rd ; add rounding constant
psraw mm3, FILTER_SHIFT ; scale back down to pixel range
packuswb mm3, mm0 ; saturate words to unsigned bytes
movd [edi],mm3 ; store output pixels 0..3
; ---- right half of the row: output pixels 4..7 (same steps, source +4) ----
movq mm3, [esi+4] ; mm3 = bytes p4..p11
movq mm4, mm3 ; mm4 = copy for the shifted lane
punpcklbw mm3, mm0 ; mm3 = p4..p7 as words
pmullw mm3, mm1 ; mm3 *= tap set 0
psrlq mm4, 8 ; mm4 = p5..p11
movq mm5, mm4 ; mm5 = p5..p11
punpcklbw mm5, mm0 ; mm5 = p5..p8 as words
pmullw mm5, mm2 ; mm5 *= tap set 1
paddw mm3, mm5 ; accumulate
paddw mm3, rd ; add rounding constant
psraw mm3, FILTER_SHIFT ; scale back down to pixel range
packuswb mm3, mm0 ; saturate words to unsigned bytes
movd [edi+4],mm3 ; store output pixels 4..7
add esi,SrcPixelsPerLine ; advance source to next line
add edi,eax; advance destination by its pitch
dec ecx ; decrement row count
jnz nextrow ; next row
}
}
/*
 * FilterBlock1d_vb8_mmx
 *
 * Vertical 2-tap (bilinear) filter producing an 8-pixel-wide block using
 * MMX.  Each output pixel combines the pixel at the same position and the
 * one directly below it: p*tap0 + p_below*tap1, plus the rounding constant
 * 'rd' (defined elsewhere in this file), shifted right by FILTER_SHIFT and
 * saturated to unsigned 8 bits.
 *
 * SrcPtr        : source pixels
 * OutputPtr     : destination block
 * PixelsPerLine : source stride in bytes (also the vertical tap step)
 * PixelStep     : unused in this routine
 * OutputHeight  : number of rows produced
 * OutputWidth   : destination stride in bytes (also the row width, 8 here)
 * Filter        : two tap sets of INT16s, spaced 16 bytes apart
 */
void FilterBlock1d_vb8_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
movq mm1, [edi] ; mm1 = tap set 0 (4 words)
movq mm2, [edi + 16] ; mm2 = tap set 1
mov edx, PixelsPerLine
mov edi, OutputPtr
mov esi, SrcPtr
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch
pxor mm0, mm0 ; mm0 = 0, used to unpack bytes to words
nextrow:
; ---- left half of the row: output pixels 0..3 ----
movq mm3, [esi] ; current row, pixels 0..7
punpcklbw mm3, mm0 ; current row, pixels 0..3 as words
pmullw mm3, mm1 ; mm3 *= tap set 0
movq mm4, [esi +edx ] ; next row, pixels 0..7
punpcklbw mm4, mm0 ; next row, pixels 0..3 as words
pmullw mm4, mm2 ; mm4 *= tap set 1
paddw mm3, mm4 ; accumulate
paddw mm3, rd ; add rounding constant
psraw mm3, FILTER_SHIFT ; scale back down to pixel range
packuswb mm3, mm0 ; saturate words to unsigned bytes
movd [edi],mm3 ; store output pixels 0..3
; ---- right half of the row: output pixels 4..7 (same steps, source +4) ----
movq mm3, [esi+4] ; current row, pixels 4..11
punpcklbw mm3, mm0 ; current row, pixels 4..7 as words
pmullw mm3, mm1 ; mm3 *= tap set 0
movq mm4, [esi +edx +4] ; next row, pixels 4..11
punpcklbw mm4, mm0 ; next row, pixels 4..7 as words
pmullw mm4, mm2 ; mm4 *= tap set 1
paddw mm3, mm4 ; accumulate
paddw mm3, rd ; add rounding constant
psraw mm3, FILTER_SHIFT ; scale back down to pixel range
packuswb mm3, mm0 ; saturate words to unsigned bytes
movd [edi+4],mm3 ; store output pixels 4..7
// the subsequent iterations repeat 3 out of 4 of these reads. Since the
// recon block should be in cache this shouldn't cost much. Its obviously
// avoidable!!!.
add esi,edx ; advance source to next line
add edi,eax ; advance destination by its pitch
dec ecx ; decrement row count
jnz nextrow ; next row
}
}
  576. /****************************************************************************
  577. *
  578. * ROUTINE : FilterBlock2dBil
  579. *
  580. * INPUTS : Pointer to source data
  581. *
  582. * OUTPUTS : Filtered data
  583. *
  584. * RETURNS : None.
  585. *
* FUNCTION : Applies a bilinear filter on the input data to produce
* a predictor block (UINT8)
  588. *
  589. * SPECIAL NOTES :
  590. *
  591. * ERRORS : None.
  592. *
  593. ****************************************************************************/
/*
 * FilterBlock2dBil_mmx
 *
 * Fully fused 2-D bilinear filter producing an 8x8 UINT8 block in one pass
 * (no intermediate buffer).  Each source row is first filtered horizontally
 * with HFilter; the previous row's normalized horizontal result (carried
 * packed in mm7) is then combined vertically with the current row using
 * VFilter.  'rd' is a rounding constant and FILTER_SHIFT the normalization
 * shift, both defined elsewhere in this file.
 *
 * SrcPtr           : source pixels (top-left of the 9x9 read region)
 * OutputPtr        : destination 8x8 block (64 bytes, stride 8)
 * SrcPixelsPerLine : source stride in bytes
 * HFilter, VFilter : two tap sets of INT16s each, spaced 16 bytes apart
 */
_inline
void FilterBlock2dBil_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
{
__asm
{
mov eax, HFilter ;
mov edi, OutputPtr ;
mov esi, SrcPtr ;
lea ecx, [edi+64] ; ecx = end of destination (8 rows * 8 bytes)
mov edx, SrcPixelsPerLine ;
movq mm1, [eax] ; mm1 = horizontal tap set 0
movq mm2, [eax+16] ; mm2 = horizontal tap set 1
mov eax, VFilter ; eax now indexes the vertical taps
pxor mm0, mm0 ; mm0 = 0, used to unpack bytes to words
// prime the pipeline: horizontally filter the first source row into mm7 ;
movq mm3, [esi] ; current row, bytes 0..7
movq mm4, mm3 ; copy for the high half
punpcklbw mm3, mm0 ; low 4 pixels as words
punpckhbw mm4, mm0 ; high 4 pixels as words
pmullw mm3, mm1 ; low half * h tap 0
pmullw mm4, mm1 ; high half * h tap 0
movq mm5, [esi+1] ; right-neighbour bytes 1..8
movq mm6, mm5 ; copy for the high half
punpcklbw mm5, mm0 ; low 4 neighbours as words
punpckhbw mm6, mm0 ; high 4 neighbours as words
pmullw mm5, mm2 ; low half * h tap 1
pmullw mm6, mm2 ; high half * h tap 1
paddw mm3, mm5 ; combine taps, low half
paddw mm4, mm6 ; combine taps, high half
paddw mm3, rd ; round
psraw mm3, FILTER_SHIFT ; normalize low half
paddw mm4, rd ; round
psraw mm4, FILTER_SHIFT ; normalize high half
movq mm7, mm3 ; mm7 = packed normalized previous row,
packuswb mm7, mm4 ; carried across iterations
add esi, edx ; next source line
NextRow:
// horizontal pass for the current row (raw sums in mm3/mm4) ;
movq mm3, [esi] ; current row, bytes 0..7
movq mm4, mm3 ; copy for the high half
punpcklbw mm3, mm0 ; low 4 pixels as words
punpckhbw mm4, mm0 ; high 4 pixels as words
pmullw mm3, mm1 ; low half * h tap 0
pmullw mm4, mm1 ; high half * h tap 0
movq mm5, [esi+1] ; right-neighbour bytes 1..8
movq mm6, mm5 ; copy for the high half
punpcklbw mm5, mm0 ; low 4 neighbours as words
punpckhbw mm6, mm0 ; high 4 neighbours as words
pmullw mm5, mm2 ; low half * h tap 1
pmullw mm6, mm2 ; high half * h tap 1
paddw mm3, mm5 ; combine taps, low half
paddw mm4, mm6 ; combine taps, high half
// vertical pass: previous row (mm7) * v tap 0 ;
movq mm5, mm7 ; previous row, packed
movq mm6, mm7 ; previous row, packed
punpcklbw mm5, mm0 ; previous row, low half as words
punpckhbw mm6, mm0 ; previous row, high half as words
pmullw mm5, [eax] ; previous row low * v tap 0
pmullw mm6, [eax] ; previous row high * v tap 0
paddw mm3, rd ; round current row, low half
psraw mm3, FILTER_SHIFT ; normalize current row, low half
paddw mm4, rd ; round current row, high half
psraw mm4, FILTER_SHIFT ; normalize current row, high half
movq mm7, mm3 ; save the normalized current row packed
packuswb mm7, mm4 ; in mm7 for the next iteration
pmullw mm3, [eax+16] ; current row low * v tap 1
pmullw mm4, [eax+16] ; current row high * v tap 1
paddw mm3, mm5 ; add previous-row contribution, low
paddw mm4, mm6 ; add previous-row contribution, high
paddw mm3, rd ; round, low half
psraw mm3, FILTER_SHIFT ; normalize, low half
paddw mm4, rd ; round, high half
psraw mm4, FILTER_SHIFT ; normalize, high half
packuswb mm3, mm4 ; saturate the full row to 8 bytes
movq [edi], mm3 ; store one 8-pixel output row
add esi, edx ; next source line
add edi, 8 ; next destination row (stride 8)
cmp edi, ecx ; done all 8 rows?
jne NextRow
}
// First filter 1d Horizontal
//FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
// Now filter Verticaly
//FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
}
  677. /****************************************************************************
  678. *
  679. * ROUTINE : FilterBlockBil_8
  680. *
* INPUTS : ReconPtr1, ReconPtr2
* Two pointers into the block of data to be filtered.
* These pointers bound the fractional pel position.
* PixelsPerLine
* Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
  686. * Modx, ModY
  687. * The fractional pel bits used to select a filter.
  688. *
  689. *
  690. * OUTPUTS : ReconRefPtr
  691. * A pointer to an 8x8 buffer into which UINT8 filtered data is written.
  692. *
  693. * RETURNS : None.
  694. *
  695. * FUNCTION : Produces a bilinear filtered fractional pel prediction block
  696. * with UINT8 output
  697. *
  698. * SPECIAL NOTES :
  699. *
  700. * ERRORS : None.
  701. *
  702. ****************************************************************************/
  703. void FilterBlockBil_8_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY )
  704. {
  705. int diff;
  706. // swap pointers so ReconPtr1 smaller (above, left, above-right or above-left )
  707. diff=ReconPtr2-ReconPtr1;
  708. // The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
  709. // This works out to be what we want... despite the pointer swapping that goes on below.
  710. // For example... if the X component of the vector is a +ve ModX = X%8.
  711. // if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.
  712. if(diff<0)
  713. { // swap pointers so ReconPtr1 smaller
  714. UINT8 *temp=ReconPtr1;
  715. ReconPtr1=ReconPtr2;
  716. ReconPtr2=temp;
  717. diff= (int)(ReconPtr2-ReconPtr1);
  718. }
  719. if( diff==1 )
  720. {
  721. FilterBlock1d_hb8_mmx(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 8, BilinearFilters_mmx[ModX] );
  722. }
  723. else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
  724. {
  725. FilterBlock1d_vb8_mmx(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_mmx[ModY]);
  726. }
  727. else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
  728. {
  729. FilterBlock2dBil_mmx( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
  730. }
  731. else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
  732. {
  733. FilterBlock2dBil_mmx( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
  734. }
  735. }
  736. /****************************************************************************
  737. *
  738. * ROUTINE : FilterBlock2d
  739. *
  740. * INPUTS : Pointer to source data
  741. *
  742. * OUTPUTS : Filtered data
  743. *
  744. * RETURNS : None.
  745. *
* FUNCTION : Applies a 2d 4 tap filter on the input data to produce
  747. * a predictor block (UINT16)
  748. *
  749. * SPECIAL NOTES :
  750. *
  751. * ERRORS : None.
  752. *
  753. ****************************************************************************/
  754. void FilterBlock2d_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
  755. {
  756. UINT8 Intermediate[256];
  757. // First filter 1d Horizontal
  758. FilterBlock1d_h_mmx(SrcPtr-SrcPixelsPerLine, Intermediate, SrcPixelsPerLine, 1, 11, 8, HFilter );
  759. // Now filter Verticaly
  760. FilterBlock1d_v_mmx(Intermediate+BLOCK_HEIGHT_WIDTH, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
  761. }
  762. /****************************************************************************
  763. *
  764. * ROUTINE : FilterBlock
  765. *
* INPUTS : ReconPtr1, ReconPtr2
* Two pointers into the block of data to be filtered.
* These pointers bound the fractional pel position.
* PixelsPerLine
* Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
* Modx, ModY
* The fractional pel bits used to select a filter.
* UseBicubic
* Whether to use the bicubic filter set or the bilinear set
  775. *
  776. *
  777. * OUTPUTS : ReconRefPtr
  778. * A pointer to an 8x8 buffer into which the filtered data is written.
  779. *
  780. * RETURNS : None.
  781. *
  782. * FUNCTION : Produces a filtered fractional pel prediction block
  783. * using bilinear or bicubic filters
  784. *
  785. * SPECIAL NOTES :
  786. *
  787. * ERRORS : None.
  788. *
  789. ****************************************************************************/
  790. void FilterBlock_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha )
  791. {
  792. int diff;
  793. UINT8 Intermediate[256];
  794. // swap pointers so ReconPtr1 smaller (above, left, above-right or above-left )
  795. diff=ReconPtr2-ReconPtr1;
  796. // The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
  797. // This works out to be what we want... despite the pointer swapping that goes on below.
  798. // For example... if the X component of the vector is a +ve ModX = X%8.
  799. // if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.
  800. if(diff<0)
  801. { // swap pointers so ReconPtr1 smaller
  802. UINT8 *temp=ReconPtr1;
  803. ReconPtr1=ReconPtr2;
  804. ReconPtr2=temp;
  805. diff= (int)(ReconPtr2-ReconPtr1);
  806. }
  807. if(!diff)
  808. {
  809. return;
  810. }
  811. if( diff==1 )
  812. { // Fractional pixel in horizontal only
  813. if ( UseBicubic )
  814. FilterBlock1d_h_mmx(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModX] );
  815. else
  816. FilterBlock1d_hb8_mmx(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BilinearFilters_mmx[ModX] );
  817. }
  818. else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
  819. {
  820. if ( UseBicubic )
  821. FilterBlock1d_v_mmx(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModY]);
  822. else
  823. FilterBlock1d_vb8_mmx(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_mmx[ModY]);
  824. }
  825. else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
  826. {
  827. if ( UseBicubic )
  828. FilterBlock2d_mmx( ReconPtr1-1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
  829. else
  830. FilterBlock2dBil_mmx( ReconPtr1-1, Intermediate, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
  831. }
  832. else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
  833. {
  834. if ( UseBicubic )
  835. FilterBlock2d_mmx( ReconPtr1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
  836. else
  837. FilterBlock2dBil_mmx( ReconPtr1, Intermediate, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
  838. }
  839. UnpackBlock_MMX( Intermediate, ReconRefPtr, 8 );
  840. }