deringwmtopt.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748
  1. /****************************************************************************
  2. *
  3. * Module Title : DeRingingWmtOpt.c
  4. *
  5. * Description : Optimized functions for PostProcessor
  6. *
  7. ***************************************************************************/
  8. #define STRICT /* Strict type checking */
  9. /****************************************************************************
  10. * Header Files
  11. ****************************************************************************/
  12. #include "postp.h"
  13. /****************************************************************************
  14. * Macros
  15. ****************************************************************************/
  16. #pragma warning(disable:4305)
  17. #pragma warning(disable:4731)
  18. /****************************************************************************
  19. * Module Statics
  20. ****************************************************************************/
  21. #if defined(_WIN32_WCE)
  22. #else
  23. __declspec(align(16)) static unsigned short eight128s []= { 128, 128, 128, 128, 128, 128, 128, 128};
  24. __declspec(align(16)) static unsigned short eight64s[] = { 64, 64, 64, 64, 64, 64, 64, 64};
  25. __declspec(align(16)) static char eight64c [] = { 64, 64, 64,64,64,64,64,64};
  26. __declspec(align(16)) static char eight32c [] = { 32,32,32,32,32,32,32,32};
  27. __declspec(align(16)) static char eight127c []= { 127, 127, 127, 127, 127, 127, 127, 127};
  28. __declspec(align(16)) static char eight128c []= { 128, 128, 128, 128, 128, 128, 128, 128};
  29. __declspec(align(16)) static unsigned char eight223c[] = { 223,223,223,223,223,223,223,223};
  30. __declspec(align(16)) static unsigned char eight231c[] = { 231,231,231,231,231,231,231,231};
  31. #endif
  32. /****************************************************************************
  33. * Imports
  34. ****************************************************************************/
  35. extern UINT32 SharpenModifier[];
  36. /****************************************************************************
  37. *
  38. * ROUTINE : DeRingBlockStrong_WMT
  39. *
  40. * INPUTS : const POSTPROC_INSTANCE *pbi : Pointer to post-processor instance.
  41. * const UINT8 *SrcPtr : Pointer to input image.
  42. * UINT8 *DstPtr : Pointer to output image.
  43. * const INT32 Pitch : Image stride.
  44. * UINT32 FragQIndex : Q-index block encoded with.
  45. * UINT32 *QuantScale : Array of quantization scale factors.
  46. *
  47. * OUTPUTS : None.
  48. *
  49. * RETURNS : void
  50. *
  51. * FUNCTION : Filtering a block for de-ringing purpose.
  52. *
  53. * SPECIAL NOTES : None.
  54. *
  55. ****************************************************************************/
  56. void DeringBlockStrong_WMT
  57. (
  58. const POSTPROC_INSTANCE *pbi,
  59. const UINT8 *SrcPtr,
  60. UINT8 *DstPtr,
  61. const INT32 Pitch,
  62. UINT32 FragQIndex,
  63. UINT32 *QuantScale
  64. )
  65. {
  66. #if defined(_WIN32_WCE)
  67. return;
  68. #else
  69. __declspec(align(16)) short UDMod[72];
  70. __declspec(align(16)) short LRMod[128];
  71. unsigned int PlaneLineStep = Pitch;
  72. const unsigned char *Src = SrcPtr;
  73. unsigned char *Des = DstPtr;
  74. short *UDPointer = UDMod;
  75. short *LRPointer = LRMod;
  76. UINT32 QStep = QuantScale[FragQIndex];
  77. INT32 Sharpen = SharpenModifier[FragQIndex];
  78. (void) pbi;
  79. __asm
  80. {
  81. push esi
  82. push edi
  83. mov esi, Src /* Source Pointer */
  84. mov edi, UDPointer /* UD modifier pointer */
  85. push ecx
  86. push edx
  87. mov ecx, PlaneLineStep /* Pitch Step */
  88. xor edx, edx
  89. push eax
  90. push ebx
  91. mov eax, QStep /* QValue */
  92. mov ebx, Sharpen /* Sharpen */
  93. movd mm0, eax /* QValue */
  94. movd mm2, ebx /* sharpen */
  95. push ebp
  96. punpcklbw mm0, mm0 /* 00 00 00 QQ */
  97. sub edx, ecx /* Negative Pitch */
  98. punpcklbw mm2, mm2 /* 00 00 00 SS */
  99. pxor mm7, mm7 /* clear mm7 for unpacks */
  100. punpcklbw mm0, mm0 /* 00 00 qq qq */
  101. mov eax, LRPointer /* Left and Right Modifier */
  102. punpcklbw mm2, mm2 /* 00 00 ss ss */
  103. lea ebx, [esi+ecx*8] /* Source Pointer of last row */
  104. punpcklbw mm0, mm0 /* qq qq qq qq */
  105. movq mm1, mm0; /* make a copy */
  106. punpcklbw mm2, mm2 /* ss ss ss ss */
  107. paddb mm1, mm0 /* QValue * 2 */
  108. paddb mm1, mm0 /* High = 3 * Qvalue */
  109. paddusb mm1, eight223c /* clamping high to 32 */
  110. paddb mm0, eight32c /* 32+QValues */
  111. psubusb mm1, eight223c /* Get the real value back */
  112. movq mm3, eight127c /* 7f 7f 7f 7f 7f 7f 7f 7f */
  113. pandn mm1, mm3 /* ClampHigh */
  114. /* mm0,mm1,mm2,mm7 are in use */
  115. /* mm0---> QValue+32 */
  116. /* mm1---> ClampHigh */
  117. /* mm2---> Sharpen */
  118. /* mm7---> Cleared for unpack */
  119. FillModLoop1:
  120. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  121. pxor xmm7, xmm7 /* clear xmm7 */
  122. movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
  123. movq mm5, mm3 /* make a copy of p */
  124. psubusb mm3, mm4 /* p-pu */
  125. psubusb mm4, mm5 /* pu-p */
  126. por mm3, mm4 /* abs(p-pu) */
  127. movq mm6, mm0 /* 32+QValues */
  128. movq mm4, mm0 /* 32+QValues */
  129. psubusb mm6, mm3 /* zero clampled TmpMod */
  130. movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
  131. paddb mm4, eight64c /* 32+QValues + 64 */
  132. pxor mm4, mm5 /* convert to a sign number */
  133. pxor mm3, mm5 /* convert to a sign number */
  134. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  135. pand mm3, mm2 /* use sharpen */
  136. paddsb mm6, mm1 /* clamping to high */
  137. psubsb mm6, mm1 /* offset back */
  138. por mm6, mm3 /* Mod value to be stored */
  139. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  140. movq2dq xmm0, mm6
  141. movq mm4, QWORD PTR [esi-1] /* Pixels on top pu */
  142. punpcklbw xmm7, xmm0 /* extended to words */
  143. movq mm5, mm3 /* make a copy of p */
  144. psraw xmm7, 8 /* sign extended */
  145. psubusb mm3, mm4 /* p-pu */
  146. movdqa [edi], xmm7 /* writeout UDmod*/
  147. psubusb mm4, mm5 /* pu-p */
  148. por mm3, mm4 /* abs(p-pu) */
  149. movq mm6, mm0 /* 32+QValues */
  150. movq mm4, mm0 /* 32+QValues */
  151. psubusb mm6, mm3 /* zero clampled TmpMod */
  152. movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
  153. paddb mm4, eight64c /* 32+QValues + 64 */
  154. pxor mm4, mm5 /* convert to a sign number */
  155. pxor mm3, mm5 /* convert to a sign number */
  156. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  157. pand mm3, mm2 /* use sharpen */
  158. paddsb mm6, mm1 /* clamping to high */
  159. psubsb mm6, mm1 /* offset back */
  160. por mm6, mm3 /* Mod value to be stored */
  161. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  162. pxor xmm7, xmm7 /* clear xmm7 */
  163. movq mm4, QWORD PTR [esi+1] /* Pixels on top pu */
  164. movq2dq xmm0, mm6
  165. movq mm5, mm3 /* make a copy of p */
  166. punpcklbw xmm7, xmm0 /* extened to shorts */
  167. psubusb mm3, mm4 /* p-pu */
  168. psraw xmm7, 8 /* sign extended */
  169. psubusb mm4, mm5 /* pu-p */
  170. movdqa [eax], xmm7 /* writeout UDmod*/
  171. por mm3, mm4 /* abs(p-pu) */
  172. movq mm6, mm0 /* 32+QValues */
  173. pxor xmm7, xmm7 /* clear xmm7 */
  174. movq mm4, mm0 /* 32+QValues */
  175. psubusb mm6, mm3 /* zero clampled TmpMod */
  176. movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
  177. paddb mm4, eight64c /* 32+QValues + 64 */
  178. pxor mm4, mm5 /* convert to a sign number */
  179. pxor mm3, mm5 /* convert to a sign number */
  180. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  181. pand mm3, mm2 /* use sharpen */
  182. paddsb mm6, mm1 /* clamping to high */
  183. psubsb mm6, mm1 /* offset back */
  184. por mm6, mm3 /* Mod value to be stored */
  185. add esi, ecx
  186. movq2dq xmm0, mm6
  187. add edi, 16
  188. punpcklbw xmm7, mm0 /* extended to shorts */
  189. add eax, 16
  190. psraw xmm7, 8 /* sign extended */
  191. cmp esi, ebx
  192. movdqa [eax+112], xmm7 /* writeout UDmod*/
  193. jne FillModLoop1
  194. /* last UDMod */
  195. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  196. pxor xmm7, xmm7 /* clear xmm7 */
  197. movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
  198. movq mm5, mm3 /* make a copy of p */
  199. psubusb mm3, mm4 /* p-pu */
  200. psubusb mm4, mm5 /* pu-p */
  201. por mm3, mm4 /* abs(p-pu) */
  202. movq mm6, mm0 /* 32+QValues */
  203. movq mm4, mm0 /* 32+QValues */
  204. psubusb mm6, mm3 /* zero clampled TmpMod */
  205. movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
  206. paddb mm4, eight64c /* 32+QValues + 64 */
  207. pxor mm4, mm5 /* convert to a sign number */
  208. pxor mm3, mm5 /* convert to a sign number */
  209. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  210. pand mm3, mm2 /* use sharpen */
  211. paddsb mm6, mm1 /* clamping to high */
  212. psubsb mm6, mm1 /* offset back */
  213. por mm6, mm3 /* Mod value to be stored */
  214. movq2dq xmm6, mm6
  215. punpcklbw xmm7, xmm6 /* 03 xx 02 xx 01 xx 00 xx */
  216. psraw xmm7, 8 /* sign extended */
  217. movdqa [edi], xmm7 /* writeout UDmod */
  218. mov esi, Src
  219. mov edi, Des
  220. mov eax, UDPointer
  221. mov ebx, LRPointer
  222. mov ebp, 8
  223. FilterLoop1:
  224. movq xmm0, QWORD PTR [esi+edx] /* mm0 = Pixels above */
  225. pxor xmm7, xmm7 /* clear mm7 */
  226. movdqa xmm4, [eax] /* au */
  227. punpcklbw xmm0, xmm7 /* extended to shorts */
  228. movq xmm2, QWORD PTR [esi+ecx] /* mm2 = pixels below */
  229. pmullw xmm0, xmm4 /* pu*au */
  230. movdqa xmm6, [eax+16] /* ad */
  231. punpcklbw xmm2, xmm7 /* extened to shorts*/
  232. movq xmm1, QWORD PTR [esi-1] /* pixel to the left */
  233. pmullw xmm2, xmm6 /* ad*pd */
  234. movdqa xmm3, [ebx] /* al */
  235. punpcklbw xmm1, xmm7 /* extended to shorts */
  236. movq xmm5, QWORD PTR [esi+1] /* pixel to the right */
  237. pmullw xmm1, xmm3 /* al * pl */
  238. paddw xmm4, xmm6 /* au+ad */
  239. punpcklbw xmm5, xmm7 /* extends to shorts */
  240. movdqa xmm6, [ebx+128] /* ar */
  241. pmullw xmm5, xmm6 /* ar * pr */
  242. paddw xmm0, xmm2 /* au*pu + ad*pd */
  243. paddw xmm4, xmm3 /* au+ad+al */
  244. paddw xmm0, xmm1 /* au*pu+ad*pd+al*pl */
  245. paddw xmm4, xmm6 /* au+ad+al+ar */
  246. movq xmm2, QWORD PTR [esi] /* p */
  247. paddw xmm0, xmm5 /* au*pu+ad*pd+al*pl+ar*pr */
  248. /* xmm0 --- au*pu+ad*pd+al*pl+ar*pr */
  249. /* xmm4 --- au + ad + al + ar */
  250. movdqa xmm1, eight128s /* 0080 0080 0080 0080 0080 0080 0080 0080 */
  251. punpcklbw xmm2, xmm7 /* extended to shorts */
  252. psubw xmm1, xmm4 /* 128-(au+ad+al+ar) */
  253. pmullw xmm2, xmm1 /* p*(128-(au+ad+al+ar)) */
  254. add esi, ecx /* Src += Pitch */
  255. movdqa xmm6, eight64s /* 64, 64, 64, 64, 64, 64, 64, 64 */
  256. movdqa xmm7, xmm6 /* 64, 64, 64, 64, 64, 64, 64, 64 */
  257. add eax, 16 /* UDPointer += 8 */
  258. psllw xmm7, 8 /* {16384, .. } */
  259. paddw xmm0, xmm2 /* sum */
  260. add edi, ecx /* Des += Pitch */
  261. paddw xmm0, xmm6 /* sum+B */
  262. add ebx, 16 /* LPointer +=8 */
  263. paddw xmm0, xmm7 /* clamping */
  264. psubusw xmm0, xmm7 /* clamping */
  265. dec ebp
  266. psrlw xmm0, 7 /* (sum+B)>>7 */
  267. packuswb xmm0, xmm7 /* pack to 8 bytes */
  268. movq QWORD PTR [edi+edx], xmm0 /* write to destination */
  269. jnz FilterLoop1
  270. pop ebp
  271. pop ebx
  272. pop eax
  273. pop edx
  274. pop ecx
  275. pop edi
  276. pop esi
  277. }
  278. #endif
  279. }
  280. /****************************************************************************
  281. *
  282. * ROUTINE : DeRingBlockWeak_WMT
  283. *
  284. * INPUTS : const POSTPROC_INSTANCE *pbi : Pointer to post-processor instance.
  285. * const UINT8 *SrcPtr : Pointer to input image.
  286. * UINT8 *DstPtr : Pointer to output image.
  287. * const INT32 Pitch : Image stride.
  288. * UINT32 FragQIndex : Q-index block encoded with.
  289. * UINT32 *QuantScale : Array of quantization scale factors.
  290. *
  291. * OUTPUTS : None.
  292. *
  293. * RETURNS : void
  294. *
  295. * FUNCTION : Filtering a block for de-ringing purpose.
  296. *
  297. * SPECIAL NOTES : None.
  298. *
  299. ****************************************************************************/
  300. void DeringBlockWeak_WMT
  301. (
  302. const POSTPROC_INSTANCE *pbi,
  303. const UINT8 *SrcPtr,
  304. UINT8 *DstPtr,
  305. const INT32 Pitch,
  306. UINT32 FragQIndex,
  307. UINT32 *QuantScale
  308. )
  309. {
  310. #if defined(_WIN32_WCE)
  311. return;
  312. #else
  313. __declspec(align(16)) short UDMod[72];
  314. __declspec(align(16)) short LRMod[128];
  315. unsigned int PlaneLineStep = Pitch;
  316. const unsigned char *Src = SrcPtr;
  317. unsigned char *Des = DstPtr;
  318. short *UDPointer = UDMod;
  319. short *LRPointer = LRMod;
  320. UINT32 QStep = QuantScale[FragQIndex];
  321. INT32 Sharpen = SharpenModifier[FragQIndex];
  322. (void) pbi;
  323. __asm
  324. {
  325. push esi
  326. push edi
  327. mov esi, Src /* Source Pointer */
  328. mov edi, UDPointer /* UD modifier pointer */
  329. push ecx
  330. push edx
  331. mov ecx, PlaneLineStep /* Pitch Step */
  332. xor edx, edx
  333. push eax
  334. push ebx
  335. mov eax, QStep /* QValue */
  336. mov ebx, Sharpen /* Sharpen */
  337. movd mm0, eax /* QValue */
  338. movd mm2, ebx /* sharpen */
  339. push ebp
  340. punpcklbw mm0, mm0 /* 00 00 00 QQ */
  341. sub edx, ecx /* Negative Pitch */
  342. punpcklbw mm2, mm2 /* 00 00 00 SS */
  343. pxor mm7, mm7 /* clear mm7 for unpacks */
  344. punpcklbw mm0, mm0 /* 00 00 qq qq */
  345. mov eax, LRPointer /* Left and Right Modifier */
  346. punpcklbw mm2, mm2 /* 00 00 ss ss */
  347. lea ebx, [esi+ecx*8] /* Source Pointer of last row */
  348. punpcklbw mm0, mm0 /* qq qq qq qq */
  349. movq mm1, mm0; /* make a copy */
  350. punpcklbw mm2, mm2 /* ss ss ss ss */
  351. paddb mm1, mm0 /* QValue * 2 */
  352. paddb mm1, mm0 /* High = 3 * Qvalue */
  353. paddusb mm1, eight231c /* clamping high to 24 */
  354. paddb mm0, eight32c /* 32+QValues */
  355. psubusb mm1, eight231c /* Get the real value back */
  356. movq mm3, eight127c /* 7f 7f 7f 7f 7f 7f 7f 7f */
  357. pandn mm1, mm3 /* ClampHigh */
  358. /* mm0,mm1,mm2,mm7 are in use */
  359. /* mm0---> QValue+32 */
  360. /* mm1---> ClampHigh */
  361. /* mm2---> Sharpen */
  362. /* mm7---> Cleared for unpack */
  363. FillModLoop1:
  364. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  365. pxor xmm7, xmm7 /* clear xmm7 */
  366. movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
  367. movq mm5, mm3 /* make a copy of p */
  368. psubusb mm3, mm4 /* p-pu */
  369. psubusb mm4, mm5 /* pu-p */
  370. por mm3, mm4 /* abs(p-pu) */
  371. movq mm6, mm0 /* 32+QValues */
  372. paddusb mm3, mm3 /* 2*abs(p-pu) */
  373. movq mm4, mm0 /* 32+QValues */
  374. psubusb mm6, mm3 /* zero clampled TmpMod */
  375. movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
  376. paddb mm4, eight64c /* 32+QValues + 64 */
  377. pxor mm4, mm5 /* convert to a sign number */
  378. pxor mm3, mm5 /* convert to a sign number */
  379. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  380. pand mm3, mm2 /* use sharpen */
  381. paddsb mm6, mm1 /* clamping to high */
  382. psubsb mm6, mm1 /* offset back */
  383. por mm6, mm3 /* Mod value to be stored */
  384. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  385. movq2dq xmm0, mm6
  386. movq mm4, QWORD PTR [esi-1] /* Pixels on top pu */
  387. punpcklbw xmm7, xmm0 /* extended to words */
  388. movq mm5, mm3 /* make a copy of p */
  389. psraw xmm7, 8 /* sign extended */
  390. psubusb mm3, mm4 /* p-pu */
  391. movdqa [edi], xmm7 /* writeout UDmod*/
  392. psubusb mm4, mm5 /* pu-p */
  393. por mm3, mm4 /* abs(p-pu) */
  394. movq mm6, mm0 /* 32+QValues */
  395. paddusb mm3, mm3 /* 2*abs(p-pu) */
  396. movq mm4, mm0 /* 32+QValues */
  397. psubusb mm6, mm3 /* zero clampled TmpMod */
  398. movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
  399. paddb mm4, eight64c /* 32+QValues + 64 */
  400. pxor mm4, mm5 /* convert to a sign number */
  401. pxor mm3, mm5 /* convert to a sign number */
  402. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  403. pand mm3, mm2 /* use sharpen */
  404. paddsb mm6, mm1 /* clamping to high */
  405. psubsb mm6, mm1 /* offset back */
  406. por mm6, mm3 /* Mod value to be stored */
  407. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  408. pxor xmm7, xmm7 /* clear xmm7 */
  409. movq mm4, QWORD PTR [esi+1] /* Pixels on top pu */
  410. movq2dq xmm0, mm6
  411. movq mm5, mm3 /* make a copy of p */
  412. punpcklbw xmm7, xmm0 /* extened to shorts */
  413. psubusb mm3, mm4 /* p-pu */
  414. psraw xmm7, 8 /* sign extended */
  415. psubusb mm4, mm5 /* pu-p */
  416. movdqa [eax], xmm7 /* writeout UDmod*/
  417. por mm3, mm4 /* abs(p-pu) */
  418. movq mm6, mm0 /* 32+QValues */
  419. paddusb mm3, mm3 /* 2*abs(p-pu) */
  420. pxor xmm7, xmm7 /* clear xmm7 */
  421. movq mm4, mm0 /* 32+QValues */
  422. psubusb mm6, mm3 /* zero clampled TmpMod */
  423. movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
  424. paddb mm4, eight64c /* 32+QValues + 64 */
  425. pxor mm4, mm5 /* convert to a sign number */
  426. pxor mm3, mm5 /* convert to a sign number */
  427. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  428. pand mm3, mm2 /* use sharpen */
  429. paddsb mm6, mm1 /* clamping to high */
  430. psubsb mm6, mm1 /* offset back */
  431. por mm6, mm3 /* Mod value to be stored */
  432. add esi, ecx
  433. movq2dq xmm0, mm6
  434. add edi, 16
  435. punpcklbw xmm7, mm0 /* extended to shorts */
  436. add eax, 16
  437. psraw xmm7, 8 /* sign extended */
  438. cmp esi, ebx
  439. movdqa [eax+112], xmm7 /* writeout UDmod*/
  440. jne FillModLoop1
  441. /* last UDMod */
  442. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  443. pxor xmm7, xmm7 /* clear xmm7 */
  444. movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
  445. movq mm5, mm3 /* make a copy of p */
  446. psubusb mm3, mm4 /* p-pu */
  447. psubusb mm4, mm5 /* pu-p */
  448. por mm3, mm4 /* abs(p-pu) */
  449. movq mm6, mm0 /* 32+QValues */
  450. paddusb mm3, mm3 /* 2*abs(p-pu) */
  451. movq mm4, mm0 /* 32+QValues */
  452. psubusb mm6, mm3 /* zero clampled TmpMod */
  453. movq mm5, eight128c /* 80 80 80 80 80 80 80 80 */
  454. paddb mm4, eight64c /* 32+QValues + 64 */
  455. pxor mm4, mm5 /* convert to a sign number */
  456. pxor mm3, mm5 /* convert to a sign number */
  457. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  458. pand mm3, mm2 /* use sharpen */
  459. paddsb mm6, mm1 /* clamping to high */
  460. psubsb mm6, mm1 /* offset back */
  461. por mm6, mm3 /* Mod value to be stored */
  462. movq2dq xmm6, mm6
  463. punpcklbw xmm7, xmm6 /* 03 xx 02 xx 01 xx 00 xx */
  464. psraw xmm7, 8 /* sign extended */
  465. movdqa [edi], xmm7 /* writeout UDmod */
  466. mov esi, Src
  467. mov edi, Des
  468. mov eax, UDPointer
  469. mov ebx, LRPointer
  470. mov ebp, 8
  471. FilterLoop1:
  472. movq xmm0, QWORD PTR [esi+edx] /* mm0 = Pixels above */
  473. pxor xmm7, xmm7 /* clear mm7 */
  474. movdqa xmm4, [eax] /* au */
  475. punpcklbw xmm0, xmm7 /* extended to shorts */
  476. movq xmm2, QWORD PTR [esi+ecx] /* mm2 = pixels below */
  477. pmullw xmm0, xmm4 /* pu*au */
  478. movdqa xmm6, [eax+16] /* ad */
  479. punpcklbw xmm2, xmm7 /* extened to shorts*/
  480. movq xmm1, QWORD PTR [esi-1] /* pixel to the left */
  481. pmullw xmm2, xmm6 /* ad*pd */
  482. movdqa xmm3, [ebx] /* al */
  483. punpcklbw xmm1, xmm7 /* extended to shorts */
  484. movq xmm5, QWORD PTR [esi+1] /* pixel to the right */
  485. pmullw xmm1, xmm3 /* al * pl */
  486. paddw xmm4, xmm6 /* au+ad */
  487. punpcklbw xmm5, xmm7 /* extends to shorts */
  488. movdqa xmm6, [ebx+128] /* ar */
  489. pmullw xmm5, xmm6 /* ar * pr */
  490. paddw xmm0, xmm2 /* au*pu + ad*pd */
  491. paddw xmm4, xmm3 /* au+ad+al */
  492. paddw xmm0, xmm1 /* au*pu+ad*pd+al*pl */
  493. paddw xmm4, xmm6 /* au+ad+al+ar */
  494. movq xmm2, QWORD PTR [esi] /* p */
  495. paddw xmm0, xmm5 /* au*pu+ad*pd+al*pl+ar*pr */
  496. /* xmm0 --- au*pu+ad*pd+al*pl+ar*pr */
  497. /* xmm4 --- au + ad + al + ar */
  498. movdqa xmm1, eight128s /* 0080 0080 0080 0080 0080 0080 0080 0080 */
  499. punpcklbw xmm2, xmm7 /* extended to shorts */
  500. psubw xmm1, xmm4 /* 128-(au+ad+al+ar) */
  501. pmullw xmm2, xmm1 /* p*(128-(au+ad+al+ar)) */
  502. add esi, ecx /* Src += Pitch */
  503. movdqa xmm6, eight64s /* 64, 64, 64, 64, 64, 64, 64, 64 */
  504. movdqa xmm7, xmm6 /* 64, 64, 64, 64, 64, 64, 64, 64 */
  505. add eax, 16 /* UDPointer += 8 */
  506. psllw xmm7, 8 /* {16384, .. } */
  507. paddw xmm0, xmm2 /* sum */
  508. add edi, ecx /* Des += Pitch */
  509. paddw xmm0, xmm6 /* sum+B */
  510. add ebx, 16 /* LPointer +=8 */
  511. paddw xmm0, xmm7 /* clamping */
  512. psubusw xmm0, xmm7 /* clamping */
  513. dec ebp
  514. psrlw xmm0, 7 /* (sum+B)>>7 */
  515. packuswb xmm0, xmm7 /* pack to 8 bytes */
  516. movq QWORD PTR [edi+edx], xmm0 /* write to destination */
  517. jnz FilterLoop1
  518. pop ebp
  519. pop ebx
  520. pop eax
  521. pop edx
  522. pop ecx
  523. pop edi
  524. pop esi
  525. }
  526. #endif
  527. }