/* filtwmt.c */
  /****************************************************************************
  *
  *   Module Title :     newLoopTest_asm.c
  *
  *   Description  :     Codec specific functions
  *
  *   AUTHOR       :     Yaowu Xu
  *
  *****************************************************************************
  *   Revision History
  *
  *   1.02 YWX 03-Nov-00 Changed confusing variable name
  *   1.01 YWX 02-Nov-00 Added the set of functions
  *   1.00 YWX 19-Oct-00 configuration baseline
  *****************************************************************************
  */
  /****************************************************************************
  *  Header Files
  *****************************************************************************
  */
  21. #define STRICT /* Strict type checking. */
  22. #include "codec_common.h"
  23. #include <math.h>
  24. /****************************************************************************
  25. * Module constants.
  26. *****************************************************************************
  27. */
  28. #define MIN(a, b) (((a) < (b)) ? (a) : (b))
  29. #define FILTER_WEIGHT 128
  30. #define FILTER_SHIFT 7
  31. __declspec(align(16)) short rd[]={64,64,64,64,64,64,64,64};
  32. __declspec(align(16)) INT16 BilinearFilters_wmt[8][16] =
  33. {
  34. { 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0 },
  35. { 112,112,112,112,112,112,112,112, 16, 16, 16, 16, 16, 16, 16, 16 },
  36. { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
  37. { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
  38. { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
  39. { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
  40. { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
  41. { 16, 16, 16, 16, 16, 16, 16, 16, 112,112,112,112,112,112,112,112 }
  42. };
  43. extern __declspec(align(16)) INT16 BicubicFilters_mmx[17][8][32];
  44. _inline
  45. void FilterBlock1d_h_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
  46. {
  47. __asm
  48. {
  49. mov edi, Filter
  50. movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
  51. movdqa xmm2, [edi+ 16] ; xmm3 *= kernel 0 modifiers.
  52. movdqa xmm6, [edi + 32] ; xmm3 *= kernel 0 modifiers.
  53. movdqa xmm7, [edi + 48] ; xmm3 *= kernel 0 modifiers.
  54. mov edi,OutputPtr
  55. mov esi,SrcPtr
  56. dec esi
  57. mov ecx, DWORD PTR OutputHeight
  58. mov eax, OutputWidth ; destination pitch?
  59. pxor xmm0, xmm0 ; xmm0 = 00000000
  60. nextrow:
  61. // kernel 0 and 3 are potentially negative taps. These negative tap filters
  62. // must be done first or we could have problems saturating our high value
  63. // tap filters
  64. movdqu xmm3, [esi] ; xmm3 = p-1..p14
  65. movdqu xmm4, xmm3 ; xmm4 = p-1..p14
  66. punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
  67. pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
  68. psrldq xmm4, 3 ; xmm4 = p2..p13
  69. movdqa xmm5, xmm4 ; xmm5 = p2..p13
  70. punpcklbw xmm5, xmm0 ; xmm5 = p2..p7
  71. pmullw xmm5, xmm7 ; xmm5 *= kernel 3 modifiers
  72. paddsw xmm3, xmm5 ; xmm3 += xmm5
  73. movdqu xmm4, [esi+1] ; xmm4 = p0..p13
  74. movdqa xmm5, xmm4 ; xmm5 = p0..p13
  75. punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
  76. pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
  77. paddsw xmm3, xmm5 ; xmm3 += xmm5
  78. psrldq xmm4, 1 ; xmm4 = p1..p13
  79. movdqa xmm5, xmm4 ; xmm5 = p1..p13
  80. punpcklbw xmm5, xmm0 ; xmm5 = p1..p7
  81. pmullw xmm5, xmm6 ; xmm5 *= kernel 2 modifiers
  82. paddsw xmm3, xmm5 ; xmm3 += xmm5
  83. paddsw xmm3, rd ; xmm3 += round value
  84. psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
  85. packuswb xmm3, xmm0 ; pack and saturate
  86. movdq2q mm0, xmm3
  87. movq [edi],mm0 ; store the results in the destination
  88. add esi,SrcPixelsPerLine ; next line
  89. add edi,eax;
  90. dec ecx ; decrement count
  91. jnz nextrow ; next row
  92. }
  93. }
  94. _inline
  95. void FilterBlock1d_v_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
  96. {
  97. __asm
  98. {
  99. mov edi, Filter
  100. movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
  101. movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
  102. movdqa xmm6, [edi + 32] ; xmm3 *= kernel 0 modifiers.
  103. movdqa xmm7, [edi + 48] ; xmm3 *= kernel 0 modifiers.
  104. mov edx, PixelsPerLine
  105. mov edi, OutputPtr
  106. mov esi, SrcPtr
  107. sub esi, PixelsPerLine
  108. mov ecx, DWORD PTR OutputHeight
  109. mov eax, OutputWidth ; destination pitch?
  110. pxor xmm0, xmm0 ; xmm0 = 00000000
  111. nextrow:
  112. movdqu xmm3, [esi] ; xmm3 = p0..p16
  113. punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
  114. pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
  115. add esi, edx ; move source forward 1 line to avoid 3 * pitch
  116. movdqu xmm4, [esi+2*edx] ; xmm4 = p0..p16
  117. punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
  118. pmullw xmm4, xmm7 ; xmm4 *= kernel 3 modifiers.
  119. paddsw xmm3, xmm4 ; xmm3 += xmm4
  120. movdqu xmm4, [esi ] ; xmm4 = p0..p16
  121. punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
  122. pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
  123. paddsw xmm3, xmm4 ; xmm3 += xmm4
  124. movdqu xmm4, [esi +edx] ; xmm4 = p0..p16
  125. punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
  126. pmullw xmm4, xmm6 ; xmm4 *= kernel 2 modifiers.
  127. paddsw xmm3, xmm4 ; xmm3 += xmm4
  128. paddsw xmm3, rd ; xmm3 += round value
  129. psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
  130. packuswb xmm3, xmm0 ; pack and unpack to saturate
  131. movdq2q mm0, xmm3
  132. movq [edi],mm0 ; store the results in the destination
  133. // the subsequent iterations repeat 3 out of 4 of these reads. Since the
  134. // recon block should be in cache this shouldn't cost much. Its obviously
  135. // avoidable!!!.
  136. add edi,eax;
  137. dec ecx ; decrement count
  138. jnz nextrow ; next row
  139. }
  140. }
  141. _inline
  142. void FilterBlock1d_hb8_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
  143. {
  144. __asm
  145. {
  146. mov edi, Filter
  147. movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
  148. movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
  149. mov edi,OutputPtr
  150. mov esi,SrcPtr
  151. mov ecx, DWORD PTR OutputHeight
  152. mov eax, OutputWidth ; destination pitch?
  153. pxor xmm0, xmm0 ; xmm0 = 00000000
  154. nextrow:
  155. movdqu xmm3, [esi] ; xmm3 = p-1..p14
  156. movdqu xmm5, xmm3 ; xmm4 = p-1..p14
  157. punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
  158. pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
  159. psrldq xmm5, 1 ; xmm4 = p0..p13
  160. punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
  161. pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
  162. paddw xmm3, xmm5 ; xmm3 += xmm5
  163. paddw xmm3, rd ; xmm3 += round value
  164. psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
  165. packuswb xmm3, xmm0 ; pack and unpack to saturate
  166. movdq2q mm0, xmm3
  167. movq [edi],mm0 ; store the results in the destination
  168. add esi,SrcPixelsPerLine ; next line
  169. add edi,eax;
  170. dec ecx ; decrement count
  171. jnz nextrow ; next row
  172. }
  173. }
  174. _inline
  175. void FilterBlock1d_vb8_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
  176. {
  177. __asm
  178. {
  179. mov edi, Filter
  180. movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
  181. movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
  182. mov edx, PixelsPerLine
  183. mov edi, OutputPtr
  184. mov esi, SrcPtr
  185. mov ecx, DWORD PTR OutputHeight
  186. mov eax, OutputWidth ; destination pitch?
  187. pxor xmm0, xmm0 ; xmm0 = 00000000
  188. nextrow:
  189. movdqu xmm3, [esi] ; xmm3 = p0..p16
  190. punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
  191. pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
  192. movdqu xmm4, [esi +edx ] ; xmm4 = p0..p16
  193. punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
  194. pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
  195. paddw xmm3, xmm4 ; xmm3 += xmm4
  196. paddw xmm3, rd ; xmm3 += round value
  197. psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
  198. packuswb xmm3, xmm0 ; pack and unpack to saturate
  199. movdq2q mm0, xmm3
  200. movq [edi],mm0 ; store the results in the destination
  201. // the subsequent iterations repeat 3 out of 4 of these reads. Since the
  202. // recon block should be in cache this shouldn't cost much. Its obviously
  203. // avoidable!!!.
  204. add esi,edx
  205. add edi,eax
  206. dec ecx ; decrement count
  207. jnz nextrow ; next row
  208. }
  209. }
  210. /****************************************************************************
  211. *
  212. * ROUTINE : FilterBlock2dBil
  213. *
  214. * INPUTS : Pointer to source data
  215. *
  216. * OUTPUTS : Filtered data
  217. *
  218. * RETURNS : None.
  219. *
  220. * FUNCTION : Applies a bilinear filter on the intput data to produce
  221. * a predictor block (UINT16)
  222. *
  223. * SPECIAL NOTES :
  224. *
  225. * ERRORS : None.
  226. *
  227. ****************************************************************************/
  228. _inline
  229. void FilterBlock2dBil_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
  230. {
  231. __asm
  232. {
  233. mov eax, HFilter ;
  234. mov edi, OutputPtr ;
  235. mov esi, SrcPtr ;
  236. lea ecx, [edi+64] ;
  237. mov edx, SrcPixelsPerLine ;
  238. movdqa xmm1, [eax] ;
  239. movdqa xmm2, [eax+16] ;
  240. mov eax, VFilter ;
  241. pxor xmm0, xmm0 ;
  242. // get the first horizontal line done ;
  243. movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
  244. movdqa xmm4, xmm3 ; make a copy of current line
  245. punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
  246. psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
  247. pmullw xmm3, xmm1 ;
  248. punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
  249. pmullw xmm4, xmm2 ;
  250. paddw xmm3, xmm4 ;
  251. paddw xmm3, rd ;
  252. psraw xmm3, FILTER_SHIFT ; ready for output
  253. movdqa xmm5, xmm3 ;
  254. add esi, edx ; next line
  255. NextRow:
  256. pmullw xmm5, [eax] ;
  257. movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
  258. movdqa xmm4, xmm3 ; make a copy of current line
  259. punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
  260. psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
  261. pmullw xmm3, xmm1 ;
  262. punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
  263. movdqa xmm6, xmm5 ;
  264. pmullw xmm4, xmm2 ;
  265. paddw xmm3, xmm4 ;
  266. paddw xmm3, rd ;
  267. psraw xmm3, FILTER_SHIFT ; ready for output
  268. movdqa xmm5, xmm3 ; make a copy for the next row
  269. pmullw xmm3, [eax+16] ;
  270. paddw xmm6, xmm3 ;
  271. paddw xmm6, rd ; xmm6 += round value
  272. psraw xmm6, FILTER_SHIFT ; xmm6 /= 128
  273. packuswb xmm6, xmm0 ; pack and unpack to saturate
  274. movdq2q mm0, xmm6
  275. movq [edi], mm0 ; store the results in the destination
  276. add esi, edx ; next line
  277. add edi, 8 ;
  278. cmp edi, ecx ;
  279. jne NextRow
  280. }
  281. // First filter 1d Horizontal
  282. //FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
  283. // Now filter Verticaly
  284. //FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
  285. }
  286. _inline
  287. void FilterUnpackBlock2dBil_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
  288. {
  289. __asm
  290. {
  291. mov eax, HFilter ;
  292. mov edi, OutputPtr ;
  293. mov esi, SrcPtr ;
  294. lea ecx, [edi+128] ;
  295. mov edx, SrcPixelsPerLine ;
  296. movdqa xmm1, [eax] ;
  297. movdqa xmm2, [eax+16] ;
  298. mov eax, VFilter ;
  299. pxor xmm0, xmm0 ;
  300. // get the first horizontal line done ;
  301. movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
  302. movdqa xmm4, xmm3 ; make a copy of current line
  303. punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
  304. psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
  305. pmullw xmm3, xmm1 ;
  306. punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
  307. pmullw xmm4, xmm2 ;
  308. paddw xmm3, xmm4 ;
  309. paddw xmm3, rd ;
  310. psraw xmm3, FILTER_SHIFT ; ready for output
  311. movdqa xmm5, xmm3 ;
  312. add esi, edx ; next line
  313. NextRow:
  314. pmullw xmm5, [eax] ;
  315. movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
  316. movdqa xmm4, xmm3 ; make a copy of current line
  317. punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
  318. psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
  319. pmullw xmm3, xmm1 ;
  320. punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
  321. movdqa xmm6, xmm5 ;
  322. pmullw xmm4, xmm2 ;
  323. paddw xmm3, xmm4 ;
  324. paddw xmm3, rd ;
  325. psraw xmm3, FILTER_SHIFT ; ready for output
  326. movdqa xmm5, xmm3 ; make a copy for the next row
  327. pmullw xmm3, [eax+16] ;
  328. paddw xmm6, xmm3 ;
  329. paddw xmm6, rd ; xmm6 += round value
  330. psraw xmm6, FILTER_SHIFT ; xmm6 /= 128
  331. movdqu [edi], xmm6;
  332. /*
  333. packuswb xmm6, xmm0 ; pack and unpack to saturate
  334. movdq2q mm0, xmm6
  335. movq [edi], mm0 ; store the results in the destination
  336. */
  337. add esi, edx ; next line
  338. add edi, 16 ;
  339. cmp edi, ecx ;
  340. jne NextRow
  341. }
  342. // First filter 1d Horizontal
  343. //FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
  344. // Now filter Verticaly
  345. //FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
  346. }
  347. _inline
  348. void FilterUnpackBlock1d_hb8_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
  349. {
  350. __asm
  351. {
  352. mov edi, Filter
  353. movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
  354. movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
  355. mov edi,OutputPtr
  356. mov esi,SrcPtr
  357. mov ecx, DWORD PTR OutputHeight
  358. mov eax, OutputWidth ; destination pitch?
  359. pxor xmm0, xmm0 ; xmm0 = 00000000
  360. nextrow:
  361. movdqu xmm3, [esi] ; xmm3 = p-1..p14
  362. movdqu xmm5, xmm3 ; xmm4 = p-1..p14
  363. punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
  364. pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
  365. psrldq xmm5, 1 ; xmm4 = p0..p13
  366. punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
  367. pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
  368. paddw xmm3, xmm5 ; xmm3 += xmm5
  369. paddw xmm3, rd ; xmm3 += round value
  370. psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
  371. /*
  372. packuswb xmm3, xmm0 ; pack and unpack to saturate
  373. movdq2q mm0, xmm3
  374. */
  375. movdqu [edi],xmm3 ; store the results in the destination
  376. add esi,SrcPixelsPerLine ; next line
  377. add edi,eax;
  378. dec ecx ; decrement count
  379. jnz nextrow ; next row
  380. }
  381. }
  382. _inline
  383. void FilterUnpackBlock1d_vb8_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
  384. {
  385. __asm
  386. {
  387. mov edi, Filter
  388. movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
  389. movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
  390. mov edx, PixelsPerLine
  391. mov edi, OutputPtr
  392. mov esi, SrcPtr
  393. mov ecx, DWORD PTR OutputHeight
  394. mov eax, OutputWidth ; destination pitch?
  395. pxor xmm0, xmm0 ; xmm0 = 00000000
  396. nextrow:
  397. movdqu xmm3, [esi] ; xmm3 = p0..p16
  398. punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
  399. pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
  400. movdqu xmm4, [esi +edx ] ; xmm4 = p0..p16
  401. punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
  402. pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
  403. paddw xmm3, xmm4 ; xmm3 += xmm4
  404. paddw xmm3, rd ; xmm3 += round value
  405. psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
  406. /*packuswb xmm3, xmm0 ; pack and unpack to saturate
  407. movdq2q mm0, xmm3
  408. */
  409. movdqu [edi],xmm3 ; store the results in the destination
  410. // the subsequent iterations repeat 3 out of 4 of these reads. Since the
  411. // recon block should be in cache this shouldn't cost much. Its obviously
  412. // avoidable!!!.
  413. add esi,edx
  414. add edi,eax
  415. dec ecx ; decrement count
  416. jnz nextrow ; next row
  417. }
  418. }
  419. /****************************************************************************
  420. *
  421. * ROUTINE : FilterBlockBil_8
  422. *
  423. * INPUTS : ReconPtr1, ReconPtr12
  424. * Two pointers into the block of data to be filtered
  425. * These pointers bound the fractional pel position
  426. * PixelsPerLine
  427. * Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr12
  428. * Modx, ModY
  429. * The fractional pel bits used to select a filter.
  430. *
  431. *
  432. * OUTPUTS : ReconRefPtr
  433. * A pointer to an 8x8 buffer into which UINT8 filtered data is written.
  434. *
  435. * RETURNS : None.
  436. *
  437. * FUNCTION : Produces a bilinear filtered fractional pel prediction block
  438. * with UINT8 output
  439. *
  440. * SPECIAL NOTES :
  441. *
  442. * ERRORS : None.
  443. *
  444. ****************************************************************************/
  445. void FilterBlockBil_8_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY )
  446. {
  447. int diff;
  448. // swap pointers so ReconPtr1 smaller (above, left, above-right or above-left )
  449. diff=ReconPtr2-ReconPtr1;
  450. // The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
  451. // This works out to be what we want... despite the pointer swapping that goes on below.
  452. // For example... if the X component of the vector is a +ve ModX = X%8.
  453. // if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.
  454. if(diff<0)
  455. { // swap pointers so ReconPtr1 smaller
  456. UINT8 *temp=ReconPtr1;
  457. ReconPtr1=ReconPtr2;
  458. ReconPtr2=temp;
  459. diff= (int)(ReconPtr2-ReconPtr1);
  460. }
  461. if( diff==1 )
  462. {
  463. FilterBlock1d_hb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 8, BilinearFilters_wmt[ModX] );
  464. }
  465. else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
  466. {
  467. FilterBlock1d_vb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_wmt[ModY]);
  468. }
  469. else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
  470. {
  471. FilterBlock2dBil_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
  472. //FilterBlock2dBil_8_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
  473. }
  474. else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
  475. {
  476. FilterBlock2dBil_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
  477. //FilterBlock2dBil_8_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
  478. }
  479. }
  480. _inline void UnpackBlock_wmt( UINT8 *SrcPtr, UINT16 *OutputPtr, UINT32 SrcPixelsPerLine )
  481. {
  482. __asm
  483. {
  484. mov edi,OutputPtr
  485. mov esi,SrcPtr
  486. mov ecx, 8
  487. mov eax, 16 ; destination pitch?
  488. pxor xmm0, xmm0 ; xmm0 = 00000000
  489. nextrow:
  490. movdqu xmm3, [esi] ; xmm3 = p-1..p14
  491. punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
  492. movdqu [edi],xmm3 ; store the results in the destination
  493. add esi,SrcPixelsPerLine ; next line
  494. add edi,eax;
  495. dec ecx ; decrement count
  496. jnz nextrow ; next row
  497. }
  498. }
  499. /****************************************************************************
  500. *
  501. * ROUTINE : FilterBlock2d
  502. *
  503. * INPUTS : Pointer to source data
  504. *
  505. * OUTPUTS : Filtered data
  506. *
  507. * RETURNS : None.
  508. *
  509. * FUNCTION : Applies a 2d 4 tap filter on the intput data to produce
  510. * a predictor block (UINT16)
  511. *
  512. * SPECIAL NOTES :
  513. *
  514. * ERRORS : None.
  515. *
  516. ****************************************************************************/
  517. void FilterBlock2d_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
  518. {
  519. UINT8 Intermediate[256];
  520. // First filter 1d Horizontal
  521. FilterBlock1d_h_wmt(SrcPtr-SrcPixelsPerLine, Intermediate, SrcPixelsPerLine, 1, 11, 8, HFilter );
  522. // Now filter Verticaly
  523. FilterBlock1d_v_wmt(Intermediate+BLOCK_HEIGHT_WIDTH, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
  524. }
  525. /****************************************************************************
  526. *
  527. * ROUTINE : FilterBlock
  528. *
  529. * INPUTS : ReconPtr1, ReconPtr12
  530. * Two pointers into the block of data to be filtered
  531. * These pointers bound the fractional pel position
  532. * PixelsPerLine
  533. * Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr12
  534. * Modx, ModY
  535. * The fractional pel bits used to select a filter.
  536. * UseBicubic
  537. * Whether to use the bicubuc filter set or the bilinear set
  538. *
  539. *
  540. * OUTPUTS : ReconRefPtr
  541. * A pointer to an 8x8 buffer into which the filtered data is written.
  542. *
  543. * RETURNS : None.
  544. *
  545. * FUNCTION : Produces a filtered fractional pel prediction block
  546. * using bilinear or bicubic filters
  547. *
  548. * SPECIAL NOTES :
  549. *
  550. * ERRORS : None.
  551. *
  552. ****************************************************************************/
  553. void FilterBlock_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha )
  554. {
  555. int diff;
  556. UINT8 Intermediate[256];
  557. // swap pointers so ReconPtr1 smaller (above, left, above-right or above-left )
  558. diff=ReconPtr2-ReconPtr1;
  559. // The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
  560. // This works out to be what we want... despite the pointer swapping that goes on below.
  561. // For example... if the X component of the vector is a +ve ModX = X%8.
  562. // if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.
  563. if(diff<0)
  564. { // swap pointers so ReconPtr1 smaller
  565. UINT8 *temp=ReconPtr1;
  566. ReconPtr1=ReconPtr2;
  567. ReconPtr2=temp;
  568. diff= (int)(ReconPtr2-ReconPtr1);
  569. }
  570. if(!diff)
  571. {
  572. return;
  573. }
  574. if(UseBicubic)
  575. {
  576. if( diff==1 )
  577. { // Fractional pixel in horizontal only
  578. FilterBlock1d_h_wmt(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModX] );
  579. }
  580. else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
  581. {
  582. FilterBlock1d_v_wmt(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModY]);
  583. }
  584. else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
  585. {
  586. FilterBlock2d_wmt( ReconPtr1-1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
  587. }
  588. else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
  589. {
  590. FilterBlock2d_wmt( ReconPtr1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
  591. }
  592. UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
  593. }
  594. else
  595. {
  596. if( diff==1 )
  597. {
  598. FilterUnpackBlock1d_hb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 16, BilinearFilters_wmt[ModX] );
  599. // Fractional pixel in horizontal only
  600. /*
  601. FilterBlock1d_hb8_wmt(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BilinearFilters_wmt[ModX] );
  602. UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
  603. */
  604. }
  605. else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
  606. {
  607. FilterUnpackBlock1d_vb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 16, BilinearFilters_wmt[ModY]);
  608. /*
  609. FilterBlock1d_vb8_wmt(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_wmt[ModY]);
  610. UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
  611. */
  612. }
  613. else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
  614. {
  615. FilterUnpackBlock2dBil_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
  616. /*
  617. FilterBlock2dBil_wmt( ReconPtr1-1, Intermediate, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
  618. UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
  619. */
  620. }
  621. else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
  622. {
  623. FilterUnpackBlock2dBil_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
  624. /*
  625. FilterBlock2dBil_wmt( ReconPtr1, Intermediate, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
  626. UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
  627. */
  628. }
  629. }
  630. }