deblockwmtopt.c 96 KB


  1. /****************************************************************************
  2. *
  3. * Module Title : DeblockwmtOpt.c
  4. *
  5. * Description : Optimized functions for deblocking
  6. *
  7. * AUTHOR : Yaowu Xu
  8. *
  9. *****************************************************************************
  10. * Revision History
  11. *
  12. * 1.02 YWX 08-Dec-00 Configuration baseline from deblockopt.c
  13. *
  14. *****************************************************************************
  15. */
  16. /****************************************************************************
  17. * Header Files
  18. *****************************************************************************
  19. */
  20. #include "postp.h"
  21. #include "stdlib.h"
  22. #include <math.h>
  23. /****************************************************************************
  24. * Module constants.
  25. *****************************************************************************
  26. */
  27. #if defined(_WIN32_WCE) /* Windows CE build: none of the constant tables below are compiled in */
  28. #else /* other targets: 16-byte-aligned tables, one 16-bit value replicated per lane */
  29. __declspec(align(16)) static short Eight128s[] = {128, 128, 128, 128,128, 128, 128, 128 }; /* eight lanes of 128 */
  30. __declspec(align(16)) static short Eight64s[] = {64, 64, 64, 64, 64, 64, 64, 64 }; /* eight lanes of 64 */
  31. __declspec(align(16)) static short EightThrees[]= {3, 3, 3, 3, 3, 3, 3, 3}; /* eight lanes of 3; loaded with movdqa and multiplied by QStep twice when deriving FLimit */
  32. __declspec(align(16)) static short EightFours[]= {4, 4, 4, 4, 4, 4, 4, 4}; /* eight lanes of 4; the "+ 4" term added into the filter sum via paddw */
  33. __declspec(align(16)) static short Four128s[] = {128, 128, 128, 128}; /* four lanes of 128 */
  34. __declspec(align(16)) static short Four64s[] = {64, 64, 64, 64 }; /* four lanes of 64 */
  35. __declspec(align(16)) static short FourThrees[]= {3, 3, 3, 3}; /* four lanes of 3 */
  36. __declspec(align(16)) static short FourFours[]= {4, 4, 4, 4}; /* four lanes of 4 */
  37. __declspec(align(16)) static short EightOnes[]= { 1, 1, 1, 1, 1, 1, 1, 1}; /* eight lanes of 1 */
  38. #endif /* !_WIN32_WCE */
  39. /****************************************************************************
  40. * Explicit Imports
  41. *****************************************************************************
  42. */
  43. extern double gaussian(double sigma, double mu, double x);
  44. extern UINT32 *DeblockLimitValuesV2;
  45. /****************************************************************************
  46. * Exported Global Variables
  47. *****************************************************************************
  48. */
  49. /****************************************************************************
  50. * Exported Functions
  51. *****************************************************************************
  52. */
  53. /****************************************************************************
  54. * Module Statics
  55. *****************************************************************************
  56. */
  57. /****************************************************************************
  58. *
  59. * ROUTINE : DeblockLoopFilteredBand_WMT
  60. *
  61. * INPUTS : pbi, SrcPtr, DesPtr, PlaneLineStep, FragAcross, StartFrag, QuantScale
  62. *
  63. * OUTPUTS : Pixel rows written through DesPtr; pbi->FragmentVariances updated
  64. *
  65. * RETURNS : None
  66. *
  67. * FUNCTION : Filter both horizontal and vertical edge in a band
  68. *
  69. * SPECIAL NOTES :
  70. *
  71. * REFERENCE :
  72. *
  73. * ERRORS : None.
  74. *
  75. ****************************************************************************/
  76. void DeblockLoopFilteredBand_WMT(
  77. POSTPROC_INSTANCE *pbi,
  78. UINT8 *SrcPtr,
  79. UINT8 *DesPtr,
  80. UINT32 PlaneLineStep,
  81. UINT32 FragAcross,
  82. UINT32 StartFrag,
  83. UINT32 *QuantScale
  84. )
  85. {
  86. UINT32 j;
  87. UINT32 CurrentFrag=StartFrag;
  88. UINT32 QStep;
  89. UINT8 *Src, *Des;
  90. UINT32 Var1, Var2;
  91. #if defined(_WIN32_WCE)
  92. return;
  93. #else
  94. __declspec(align(16)) short QStepWMT[8];
  95. __declspec(align(16)) short FLimitWMT[8];
  96. __declspec(align(16)) short Rows[80];
  97. __declspec(align(16)) unsigned short Variance1[8];
  98. __declspec(align(16)) unsigned short Variance2[8];
  99. Src=SrcPtr;
  100. Des=DesPtr;
  101. while(CurrentFrag < StartFrag + FragAcross )
  102. {
  103. QStep = QuantScale[ pbi->FragQIndex[CurrentFrag+FragAcross]];
  104. if( QStep > 3 )
  105. {
  106. QStepWMT[0] = (INT16)QStep;
  107. QStepWMT[1] = (INT16)QStep;
  108. QStepWMT[2] = (INT16)QStep;
  109. QStepWMT[3] = (INT16)QStep;
  110. QStepWMT[4] = (INT16)QStep;
  111. QStepWMT[5] = (INT16)QStep;
  112. QStepWMT[6] = (INT16)QStep;
  113. QStepWMT[7] = (INT16)QStep;
  114. __asm
  115. {
  116. /* Save the registers */
  117. push eax
  118. push ecx
  119. push edx
  120. push esi
  121. push edi
  122. /* Calculate the FLimit and store FLimit and QStep */
  123. movdqa xmm0, QStepWMT /* xmm0 = QStep */
  124. movdqa xmm1, EightThrees /* mm1 = 03030303 */
  125. pmullw xmm1, xmm0 /* mm1 = QStep * 3 */
  126. pmullw xmm1, xmm0 /* mm1 = QStep * QStep * 3 */
  127. psrlw xmm1, 5 /* mm1 = FLimit */
  128. movdqa [FLimitWMT], xmm1 /* Save FLimit */
  129. /* setup the pointers */
  130. mov eax, Src /* eax = Src */
  131. xor edx, edx /* clear edx */
  132. mov esi, Des /* esi = Des */
  133. lea edi, Rows /* edi = Rows */
  134. mov ecx, PlaneLineStep /* ecx = Pitch */
  135. pxor xmm7, xmm7 /* Clear xmm7 */
  136. sub edx, ecx /* edx = -Pitch */
  137. lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
  138. lea esi, [esi + edx * 2 ] /* esi = Des - 2 * Pitch */
  139. /* Copy the data to the intermediate buffer */
  140. movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[-5*Pitch] */
  141. movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[-4*Pitch */
  142. punpcklbw xmm0, xmm7 /* expand to words */
  143. punpcklbw xmm1, xmm7 /* expand to words */
  144. movdqa [edi], xmm0 /* write 8 words */
  145. movdqa [edi+16], xmm1 /* write 8 words */
  146. movq xmm2, QWORD PTR [eax+ecx] /* xmm2 = Src[-3*Pitch] */
  147. movq xmm3, QWORD PTR [eax+ecx*2]/* xmm3 = Src[-2*Pitch] */
  148. punpcklbw xmm2, xmm7 /* expand to words */
  149. punpcklbw xmm3, xmm7 /* expand to words */
  150. movdqa [edi+32], xmm2 /* write 8 words */
  151. movdqa [edi+48], xmm3 /* write 8 words */
  152. lea eax, [eax+ecx*4] /* eax= Src */
  153. movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[-Pitch] */
  154. movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[0] */
  155. punpcklbw xmm0, xmm7 /* expand to words */
  156. punpcklbw xmm1, xmm7 /* expand to words */
  157. movdqa [edi+64], xmm0 /* write 8 words */
  158. movdqa [edi+80], xmm1 /* write 8 words */
  159. movq xmm2, QWORD PTR [eax+ecx] /* xmm2 = Src[Pitch] */
  160. movq xmm3, QWORD PTR [eax+ecx*2]/* xmm3 = Src[2*Pitch] */
  161. punpcklbw xmm2, xmm7 /* expand to words */
  162. punpcklbw xmm3, xmm7 /* expand to words */
  163. movdqa [edi+96], xmm2 /* write 8 words */
  164. movdqa [edi+112], xmm3 /* write 8 words */
  165. lea eax, [eax+ecx*4] /* eax= Src+4*Pitch */
  166. movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[3*Pitch] */
  167. movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[4*Pitch] */
  168. punpcklbw xmm0, xmm7 /* expand to words */
  169. punpcklbw xmm1, xmm7 /* expand to words */
  170. movdqa [edi+128], xmm0 /* write 8 words */
  171. movdqa [edi+144], xmm1 /* write 8 words */
  172. /* done with copying everything to intermediate buffer */
  173. /* Now, compute the variances for Pixel 1-4 and 5-8 */
  174. /* we use xmm0,xmm1,xmm2 for 1234 and xmm4, xmm5, xmm6 for 5-8 */
  175. /* xmm7 = 0, xmm3 = {128, 128, 128, 128, 128, 128, 128, 128} */
  176. pcmpeqw xmm3, xmm3 /* xmm3 = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF */
  177. psllw xmm3, 15 /* xmm3 = 80008000800080008000800080008000 */
  178. psrlw xmm3, 8 /* xmm3 = 00800080008000800080008000800080 */
  179. movdqa xmm2, [edi+16] /* Pixel 1 */
  180. movdqa xmm6, [edi+80] /* Pixel 5 */
  181. psubw xmm2, xmm3 /* xmm2 -=128 */
  182. psubw xmm6, xmm3 /* xmm6 -=128 */
  183. movdqa xmm0, xmm2 /* xmm0 = pixel 1 */
  184. movdqa xmm4, xmm6 /* xmm4 = pixel 5 */
  185. pmullw xmm2, xmm2 /* xmm2 = pixel1 * pixel1 */
  186. pmullw xmm6, xmm6 /* xmm6 = pixel5 * pixel5 */
  187. movdqa xmm1, xmm2 /* xmm1 = pixel1^2 */
  188. movdqa xmm5, xmm6 /* xmm5 = pixel5^2 */
  189. movdqa xmm2, [edi+32] /* Pixel 2 */
  190. movdqa xmm6, [edi+96] /* Pixel 6 */
  191. psubw xmm2, xmm3 /* xmm2 -=128 */
  192. psubw xmm6, xmm3 /* xmm6 -=128 */
  193. paddw xmm0, xmm2 /* xmm0 += pixel 2 */
  194. paddw xmm4, xmm6 /* xmm4 += pixel 6 */
  195. pmullw xmm2, xmm2 /* xmm2 = pixel2^2 */
  196. pmullw xmm6, xmm6 /* xmm6 = pixel6^2 */
  197. paddw xmm1, xmm2 /* xmm1 += pixel2^2 */
  198. paddw xmm5, xmm6 /* xmm5 += pixel6^2 */
  199. movdqa xmm2, [edi+48] /* Pixel 3 */
  200. movdqa xmm6, [edi+112] /* Pixel 7 */
  201. psubw xmm2, xmm3 /* xmm2 -=128 */
  202. psubw xmm6, xmm3 /* xmm6 -=128 */
  203. paddw xmm0, xmm2 /* xmm0 += pixel 3 */
  204. paddw xmm4, xmm6 /* xmm4 += pixel 7 */
  205. pmullw xmm2, xmm2 /* xmm2 = pixel3^2 */
  206. pmullw xmm6, xmm6 /* xmm6 = pixel7^2 */
  207. paddw xmm1, xmm2 /* xmm1 += pixel3^2 */
  208. paddw xmm5, xmm6 /* xmm5 += pixel7^2 */
  209. movdqa xmm2, [edi+64] /* Pixel 4 */
  210. movdqa xmm6, [edi+128] /* Pixel 8 */
  211. psubw xmm2, xmm3 /* xmm2 -=128 */
  212. psubw xmm6, xmm3 /* xmm6 -=128 */
  213. paddw xmm0, xmm2 /* xmm0 += pixel 4 */
  214. paddw xmm4, xmm6 /* xmm4 += pixel 8 */
  215. pmullw xmm2, xmm2 /* xmm2 = pixel4^2 */
  216. pmullw xmm6, xmm6 /* xmm6 = pixel8^2 */
  217. paddw xmm1, xmm2 /* xmm1 = pixel4^2 */
  218. paddw xmm5, xmm6 /* xmm5 = pixel8^2 */
  219. /* xmm0 = x1 + x2 + x3 + x4 */
  220. /* xmm1 = x1^2 + x2^2 + x3^2 + x4^2 */
  221. /* xmm4 = x5 + x6 + x7 + x8 */
  222. /* xmm5 = x5^2 + x6^2 + x7^2 + x8^2 */
  223. movdqa xmm7, xmm3 /* xmm7 = xmm3 */
  224. psrlw xmm7, 7 /* xmm7 = 00010001000100010001000100010001 */
  225. movdqa xmm2, xmm0 /* make copy of sum1 */
  226. movdqa xmm6, xmm4 /* make copy of sum2 */
  227. paddw xmm0, xmm7 /* (sum1 + 1) */
  228. paddw xmm4, xmm7 /* (sum2 + 1) */
  229. psraw xmm2, 1 /* sum1 /2 */
  230. psraw xmm6, 1 /* sum2 /2 */
  231. psraw xmm0, 1 /* (sum1 + 1)/2 */
  232. psraw xmm4, 1 /* (sum2 + 1)/2 */
  233. pmullw xmm2, xmm0 /* (sum1)/2*(sum1+1)/2 */
  234. pmullw xmm6, xmm4 /* (sum2)/2*(sum2+1)/2 */
  235. psubw xmm1, xmm2 /* Variance 1 */
  236. psubw xmm5, xmm6 /* Variance 2 */
  237. movdqa xmm7, FLimitWMT /* xmm7 = FLimit */
  238. movdqa xmm2, xmm1 /* copy of Varinace 1*/
  239. movdqa [Variance1], xmm1 /* save the varinace1 */
  240. movdqa [Variance2], xmm5 /* save the varinace2 */
  241. movdqa xmm6, xmm5 /* Variance 2 */
  242. psubw xmm1, xmm7 /* Variance 1 < Flimit? */
  243. psubw xmm5, xmm7 /* Variance 2 < Flimit? */
  244. psraw xmm2, 15 /* Variance 1 > 32768? */
  245. psraw xmm6, 15 /* Vaiance 2 > 32768? */
  246. psraw xmm1, 15 /* FFFF/0000 for true/false */
  247. psraw xmm5, 15 /* FFFF/0000 for true/false */
  248. movdqa xmm7, [edi+64] /* xmm0 = Pixel 4 */
  249. pandn xmm2, xmm1 /* Variance1<32678 &&
  250. Variance1<Limit */
  251. pandn xmm6, xmm5 /* Variance2<32678 &&
  252. Variance1<Limit */
  253. movdqa xmm4, [edi+80] /* xmm4 = Pixel 5 */
  254. pand xmm6, xmm2 /* xmm6 = Variance1 < Flimit */
  255. /* &&Variance2 < Flimit */
  256. movdqa xmm2, xmm7 /* make copy of Pixel4 */
  257. psubusw xmm7, xmm4 /* 4 - 5 */
  258. psubusw xmm4, xmm2 /* 5 - 4 */
  259. por xmm7, xmm4 /* abs(4 - 5) */
  260. psubw xmm7, QStepWMT /* abs(4-5)<QStepxmmx ? */
  261. psraw xmm7, 15 /* FFFF/0000 for True/Flase */
  262. pand xmm7, xmm6
  263. /* xmm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
  264. /* xmm7 now are in use */
  265. /* Let's do the filtering now */
  266. /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
  267. /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
  268. movdqa xmm5, [edi] /* xmm5 = -5 */
  269. movdqa xmm4, [edi + 16] /* xmm4 = -4 */
  270. movdqa xmm3, xmm4 /* copy of -4 */
  271. movdqa xmm6, xmm5 /* copy of -5 */
  272. psubusw xmm4, xmm6 /* xmm4 = [-4] - [-5] */
  273. psubusw xmm5, xmm3 /* xmm5 = [-5] - [-4] */
  274. por xmm4, xmm5 /* abs([-4]-[-5] ) */
  275. psubw xmm4, QStepWMT /* abs([-4]-[-5] )<QStep? */
  276. psraw xmm4, 15 /* FFFF/0000 for True/False */
  277. movdqa xmm1, xmm4 /* copy of the xmm4 */
  278. pand xmm4, xmm6 /* */
  279. pandn xmm1, xmm3 /* */
  280. por xmm1, xmm4 /* xmm1 = p1 */
  281. /* now find P2 */
  282. movdqa xmm4, [edi+128] /* xmm4 = [3] */
  283. movdqa xmm5, [edi+144] /* xmm5 = [4] */
  284. movdqa xmm3, xmm4 /* copy of 3 */
  285. movdqa xmm6, xmm5 /* copy of 4 */
  286. psubusw xmm4, xmm6 /* xmm4 = [3] - [4] */
  287. psubusw xmm5, xmm3 /* xmm5 = [4] - [3] */
  288. por xmm4, xmm5 /* abs([3]-[4] ) */
  289. psubw xmm4, QStepWMT /* abs([3]-[4] )<QStep? */
  290. psraw xmm4, 15 /* FFFF/0000 for True/False */
  291. movdqa xmm2, xmm4 /* copy of the xmm4 */
  292. pand xmm4, xmm6 /* */
  293. pandn xmm2, xmm3 /* */
  294. por xmm2, xmm4 /* xmm2 = p2 */
  295. /* Data is ready, now do the filtering */
  296. pxor xmm0, xmm0 /* clear xmm0 */
  297. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  298. /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
  299. /* Des[-w4] = Src[-w4]; */
  300. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
  301. movdqa xmm3, xmm1 /* xmm3 = p1 */
  302. paddw xmm3, xmm3 /* xmm3 = p1 + p1 */
  303. paddw xmm3, xmm1 /* xmm3 = p1 + p1 + p1 */
  304. movdqa xmm4, [edi+16] /* xmm4 = x1 */
  305. paddw xmm3, [edi+32] /* xmm3 = p1+p1+p1+ x2 */
  306. paddw xmm4, [edi+48] /* xmm4 = x1+x3 */
  307. paddw xmm3, [edi+64] /* xmm3 += x4 */
  308. paddw xmm4, EightFours /* xmm4 = x1 + x3 + 4 */
  309. paddw xmm3, xmm4 /* xmm3 = 3*p1+x1+x2+x3+x4+4 */
  310. movdqa xmm4, xmm3 /* xmm4 = xmm3 */
  311. movdqa xmm5, [edi+16] /* xmm5 = x1 */
  312. paddw xmm4, xmm5 /* xmm4 = sum+x1 */
  313. psllw xmm4, 1 /* xmm4 = (sum+x1)<<1 */
  314. psubw xmm4, [edi+64] /* xmm4 = (sum+x1)<<1-x4 */
  315. paddw xmm4, [edi+80] /* xmm4 = (sum+x1)<<1-x4+x5 */
  316. psraw xmm4, 4 /* xmm4 >>=4 */
  317. psubw xmm4, xmm5 /* New Value - old Value */
  318. pand xmm4, xmm7 /* And the flag */
  319. paddw xmm4, xmm5 /* add the old value back */
  320. packuswb xmm4, xmm0 /* pack it to bytes */
  321. movq QWORD PTR [esi+edx*2], xmm4 /* Write new x1 */
  322. /* sum += x5 -p1 */
  323. /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
  324. movdqa xmm5, [edi+32] /* xmm5= x2 */
  325. psubw xmm3, xmm1 /* sum=sum-p1 */
  326. paddw xmm3, [edi+80] /* sum=sum+x5 */
  327. movdqa xmm4, xmm5 /* copy sum */
  328. paddw xmm4, xmm3 /* xmm4=sum+x2 */
  329. paddw xmm4, xmm4 /* xmm4 <<= 1 */
  330. psubw xmm4, [edi+80] /* xmm4 =(sum+x2)<<1-x5 */
  331. paddw xmm4, [edi+96] /* xmm4 =(sum+x2)<<1-x5+x6 */
  332. psraw xmm4, 4 /* xmm4=((sum+x2)<<1-x5+x6)>>4 */
  333. psubw xmm4, xmm5 /* new value - old value */
  334. pand xmm4, xmm7 /* And the flag */
  335. paddw xmm4, xmm5 /* add the old value back */
  336. packuswb xmm4, xmm0 /* pack it to bytes */
  337. movq QWORD PTR [esi+edx], xmm4 /* write new x2 */
  338. /* sum += x6 - p1 */
  339. /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
  340. movdqa xmm5, [edi+48] /* xmm5= x3 */
  341. psubw xmm3, xmm1 /* sum=sum-p1 */
  342. paddw xmm3, [edi+96] /* sum=sum+x6 */
  343. movdqa xmm4, xmm5 /* copy x3 */
  344. paddw xmm4, xmm3 /* xmm4=sum+x3 */
  345. paddw xmm4, xmm4 /* xmm4 <<= 1 */
  346. psubw xmm4, [edi+96] /* xmm4 =(sum+x3)<<1-x6 */
  347. paddw xmm4, [edi+112] /* xmm4 =(sum+x3)<<1-x6+x7 */
  348. psraw xmm4, 4 /* xmm4=((sum+x3)<<1-x6+x7)>>4 */
  349. psubw xmm4, xmm5 /* new value - old value */
  350. pand xmm4, xmm7 /* And the flag */
  351. paddw xmm4, xmm5 /* add the old value back */
  352. packuswb xmm4, xmm0 /* pack it to bytes */
  353. movq QWORD PTR [esi],xmm4 /* write new x3 */
  354. /* sum += x7 - p1 */
  355. /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
  356. movdqa xmm5, [edi+64] /* xmm5 = x4 */
  357. psubw xmm3, xmm1 /* sum = sum-p1 */
  358. paddw xmm3, [edi+112] /* sum = sum+x7 */
  359. movdqa xmm4, xmm5 /* xmm4 = x4 */
  360. paddw xmm4, xmm3 /* xmm4 = sum + x4 */
  361. paddw xmm4, xmm4 /* xmm4 *=2 */
  362. paddw xmm4, xmm1 /* += p1 */
  363. psubw xmm4, [edi+16] /* -= x1 */
  364. psubw xmm4, [edi+112] /* -= x7 */
  365. paddw xmm4, [edi+128] /* += x8 */
  366. psraw xmm4, 4 /* >>=4 */
  367. psubw xmm4, xmm5 /* -=x4 */
  368. pand xmm4, xmm7 /* and flag */
  369. paddw xmm4, xmm5 /* += x4 */
  370. packuswb xmm4, xmm0 /* pack it to bytes */
  371. movq QWORD PTR [esi+ecx], xmm4 /* write new x4 */
  372. /* sum+= x8-x1 */
  373. /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
  374. movdqa xmm5, [edi+80] /* xmm5 = x5 */
  375. psubw xmm3, [edi+16] /* sum -= x1 */
  376. paddw xmm3, [edi+128] /* sub += x8 */
  377. movdqa xmm4, xmm5 /* xmm4 = x5 */
  378. paddw xmm4, xmm3 /* xmm4= sum+x5 */
  379. paddw xmm4, xmm4 /* xmm4 *= 2 */
  380. paddw xmm4, [edi+16] /* += x1 */
  381. psubw xmm4, [edi+32] /* -= x2 */
  382. psubw xmm4, [edi+128] /* -= x8 */
  383. paddw xmm4, xmm2 /* += p2 */
  384. psraw xmm4, 4 /* >>=4 */
  385. psubw xmm4, xmm5 /* -=x5 */
  386. pand xmm4, xmm7 /* and flag */
  387. paddw xmm4, xmm5 /* += x5 */
  388. lea esi, [esi+ecx*4] /* esi=des + 2*pitch */
  389. packuswb xmm4, xmm0 /* pack to bytes */
  390. movq QWORD PTR [esi+edx*2], xmm4 /* write new x5 */
  391. /* sum += p2 - x2 */
  392. /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
  393. movdqa xmm5, [edi+96] /* xmm5 = x6 */
  394. psubw xmm3, [edi+32] /* -= x2 */
  395. paddw xmm3, xmm2 /* += p2 */
  396. movdqa xmm4, xmm5 /* xmm4 = x6 */
  397. paddw xmm4, xmm3 /* xmm4 = sum+x6 */
  398. paddw xmm4, xmm4 /* xmm4 *= 2*/
  399. paddw xmm4, [edi+32] /* +=x2 */
  400. psubw xmm4, [edi+48] /* -=x3 */
  401. psraw xmm4, 4 /* >>=4 */
  402. psubw xmm4, xmm5 /* -=x6 */
  403. pand xmm4, xmm7 /* and flag */
  404. paddw xmm4, xmm5 /* += x6 */
  405. packuswb xmm4, xmm0 /* pack to bytes */
  406. movq QWORD PTR [esi+edx], xmm4 /* write new x6 */
  407. /* sum += p2 - x3 */
  408. /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
  409. movdqa xmm5, [edi+112] /* xmm5 = x7 */
  410. psubw xmm3, [edi+48] /* -= x3 */
  411. paddw xmm3, xmm2 /* += p2 */
  412. movdqa xmm4, xmm5 /* xmm4 = x7 */
  413. paddw xmm4, xmm3 /* xmm4 = sum+x7 */
  414. paddw xmm4, xmm4 /* xmm4 *= 2*/
  415. paddw xmm4, [edi+48] /* +=x3 */
  416. psubw xmm4, [edi+64] /* -=x4 */
  417. psraw xmm4, 4 /* >>=4 */
  418. psubw xmm4, xmm5 /* -=x7 */
  419. pand xmm4, xmm7 /* and flag */
  420. paddw xmm4, xmm5 /* += x7 */
  421. packuswb xmm4, xmm0 /* pack to bytes */
  422. movq QWORD PTR [esi],xmm4 /* write new x7 */
  423. /* sum += p2 - x4 */
  424. /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
  425. movdqa xmm5, [edi+128] /* xmm5 = x8 */
  426. psubw xmm3, [edi+64] /* -= x4 */
  427. paddw xmm3, xmm2 /* += p2 */
  428. movdqa xmm4, xmm5 /* xmm4 = x8 */
  429. paddw xmm4, xmm3 /* xmm4 = sum+x8 */
  430. paddw xmm4, xmm4 /* xmm4 *= 2*/
  431. paddw xmm4, [edi+64] /* +=x4 */
  432. psubw xmm4, [edi+80] /* -=x5 */
  433. psraw xmm4, 4 /* >>=4 */
  434. psubw xmm4, xmm5 /* -=x8 */
  435. pand xmm4, xmm7 /* and flag */
  436. paddw xmm4, xmm5 /* += x8 */
  437. packuswb xmm4, xmm0 /* pack to bytes */
  438. movq QWORD PTR [esi+ecx], xmm4 /* write new x8 */
  439. pop edi
  440. pop esi
  441. pop edx
  442. pop ecx
  443. pop eax
  444. } /* end of the macro */
  445. Var1=Variance1[0]+Variance1[1]+Variance1[2]+Variance1[3]+Variance1[4]+Variance1[5]+Variance1[6]+Variance1[7];
  446. Var2=Variance2[0]+Variance2[1]+Variance2[2]+Variance2[3]+Variance2[4]+Variance2[5]+Variance2[6]+Variance2[7];
  447. pbi->FragmentVariances[CurrentFrag] += Var1;
  448. pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
  449. }
  450. else
  451. {
  452. /* copy from src to des */
  453. __asm
  454. {
  455. push esi
  456. push edi
  457. push ecx
  458. mov esi, Src /* esi = Src */
  459. mov edi, Des /* edi = Des */
  460. push edx
  461. mov ecx, PlaneLineStep /* ecx = Pitch */
  462. xor edx, edx /* clear edx */
  463. sub edx, ecx /* edx = -Pitch */
  464. lea esi, [esi+edx*4] /* esi=Src-4*Pitch*/
  465. movq mm0, [esi] /* first row */
  466. movq [edi+edx*4], mm0 /* write first row */
  467. lea edi, [edi+edx*4] /* edi=Des-4*Pitch*/
  468. movq mm1, [esi+ecx] /* Src-3*Pitch */
  469. movq [edi+ecx], mm1 /* write second row */
  470. movq mm2, [esi+ecx*2] /* Src-2*Pitch */
  471. lea esi, [esi+ecx*4] /* Src */
  472. movq [edi+ecx*2], mm2 /* write third row */
  473. lea edi, [edi+ecx*4] /* Des */
  474. movq mm3, [esi+edx] /* Src-Pitch */
  475. movq [edi+edx], mm3 /* write fourth row */
  476. movq mm4, [esi] /* Src */
  477. movq mm5, [esi+ecx] /* Src+Pitch */
  478. movq [edi], mm4 /* write fifth rwo */
  479. movq mm6, [esi+ecx*2]
  480. lea esi, [esi+ecx*4] /* Src+pitch*4 */
  481. movq [edi+ecx], mm5 /* write the sixth rwo */
  482. movq [edi+ecx*2], mm6 /* write the seventh row */
  483. movq mm7, [esi+edx]
  484. lea edi, [edi+ecx*4] /* Des+Pitch*4 */
  485. movq [edi+edx], mm7 /* write the last row */
  486. pop edx
  487. pop ecx
  488. pop edi
  489. pop esi
  490. }
  491. }
  492. Src += 8;
  493. Des += 8;
  494. CurrentFrag ++;
  495. }
  496. Des -= ((PlaneLineStep + FragAcross)<<3);
  497. Des += 8;
  498. Src = Des;
  499. CurrentFrag = StartFrag ;
  500. while(CurrentFrag < StartFrag + FragAcross - 1)
  501. {
  502. QStep = QuantScale[pbi->FragQIndex[CurrentFrag+1]];
  503. if( QStep > 3 )
  504. {
  505. QStepWMT[0] = (INT16)QStep;
  506. QStepWMT[1] = (INT16)QStep;
  507. QStepWMT[2] = (INT16)QStep;
  508. QStepWMT[3] = (INT16)QStep;
  509. QStepWMT[4] = (INT16)QStep;
  510. QStepWMT[5] = (INT16)QStep;
  511. QStepWMT[6] = (INT16)QStep;
  512. QStepWMT[7] = (INT16)QStep;
  513. for( j=0; j<8;j++)
  514. {
  515. Rows[j] = (short) (Src[-5 +j*PlaneLineStep]);
  516. Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
  517. }
  518. __asm
  519. {
  520. /* Save the registers */
  521. push eax
  522. push ecx
  523. push edx
  524. push esi
  525. push edi
  526. /* Calculate the FLimit and store FLimit and QStep */
  527. movdqa xmm0, QStepWMT /* Get QStep */
  528. movdqa xmm1, EightThrees /* mm1 = 03030303 */
  529. pmullw xmm1, xmm0 /* mm1 = QStep * 3 */
  530. pmullw xmm1, xmm0 /* mm1 = QStep * QStep * 3 */
  531. psrlw xmm1, 5 /* mm1 = FLimit */
  532. movdqa [FLimitWMT], xmm1 /* Save FLimit */
  533. /* setup the pointers to data */
  534. mov eax, Src /* eax = Src */
  535. xor edx, edx /* clear edx */
  536. mov esi, Des /* esi = Des */
  537. sub eax, 4 /* eax = Src-4 */
  538. sub esi, 4 /* esi = Des-4 */
  539. lea edi, Rows /* edi = Rows */
  540. mov ecx, PlaneLineStep /* ecx = Pitch */
  541. sub edx, ecx /* edx = -Pitch */
  542. lea esi, [esi+ecx*2] /* esi = Des-4 + 2 * Pitch */
  543. /* Get the data to the intermediate buffer */
  544. movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
  545. movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
  546. movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
  547. lea eax, [eax+ecx*4] /* Go down four Rows */
  548. movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
  549. movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
  550. punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
  551. punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
  552. movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
  553. punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
  554. punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
  555. movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
  556. punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
  557. punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
  558. movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
  559. punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
  560. punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
  561. pxor mm7, mm7 /* clear mm7 */
  562. movq mm5, mm0 /* make a copy */
  563. punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
  564. movq [edi+16], mm0 /* write 00 10 20 30 */
  565. punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
  566. movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
  567. movq [edi+32], mm5 /* write 01 11 21 31 */
  568. punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
  569. punpckhbw mm0, mm7 /* mm0 = 33 23 12 03 */
  570. movq [edi+48], mm1 /* write 02 12 22 32 */
  571. movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
  572. movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
  573. movq [edi+64], mm0 /* write 03 13 23 33 */
  574. punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
  575. punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
  576. movq [edi+80], mm2 /* write 04 14 24 34 */
  577. punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
  578. punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
  579. movq [edi+96], mm3 /* write 05 15 25 35 */
  580. movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
  581. movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
  582. movq [edi+112], mm4 /* write 06 16 26 37 */
  583. movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
  584. lea eax, [eax+ ecx*4] /* Go down four rows */
  585. movq [edi+128], mm5 /* write 07 17 27 37 */
  586. movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
  587. movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
  588. punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
  589. punpckhbw mm4, mm1 /* mm4 = 57 57 56 46 55 45 54 44 */
  590. movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
  591. punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
  592. punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
  593. movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
  594. punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
  595. punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
  596. movq mm2, mm4 /* mm2 = 57 57 56 46 55 45 54 44 */
  597. punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
  598. punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
  599. movq mm5, mm0 /* make a copy */
  600. punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
  601. movq [edi+24], mm0 /* write 40 50 60 70 */
  602. punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
  603. movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
  604. movq [edi+40], mm5 /* write 41 51 61 71 */
  605. punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
  606. punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
  607. movq [edi+56], mm1 /* write 42 52 62 72 */
  608. movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
  609. movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
  610. movq [edi+72], mm0 /* write 43 53 63 73 */
  611. punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
  612. punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
  613. movq [edi+88], mm2 /* write 44 54 64 74 */
  614. punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
  615. punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
  616. movq [edi+104], mm3 /* write 45 55 65 75 */
  617. movq [edi+120], mm4 /* write 46 56 66 76 */
  618. movq [edi+136], mm5 /* write 47 57 67 77 */
  619. /* we use xmm0,xmm1,xmm2 for 1234 and xmm4, xmm5, xmm6 for 5-8 */
  620. /* xmm7 = 0, xmm3 = {128, 128, 128, 128, 128, 128, 128, 128} */
  621. pcmpeqw xmm3, xmm3 /* xmm3 = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF */
  622. psllw xmm3, 15 /* xmm3 = 80008000800080008000800080008000 */
  623. psrlw xmm3, 8 /* xmm3 = 00800080008000800080008000800080 */
  624. movdqa xmm2, [edi+16] /* Pixel 1 */
  625. movdqa xmm6, [edi+80] /* Pixel 5 */
  626. psubw xmm2, xmm3 /* xmm2 -=128 */
  627. psubw xmm6, xmm3 /* xmm6 -=128 */
  628. movdqa xmm0, xmm2 /* xmm0 = pixel 1 */
  629. movdqa xmm4, xmm6 /* xmm4 = pixel 5 */
  630. pmullw xmm2, xmm2 /* xmm2 = pixel1 * pixel1 */
  631. pmullw xmm6, xmm6 /* xmm6 = pixel5 * pixel5 */
  632. movdqa xmm1, xmm2 /* xmm1 = pixel1^2 */
  633. movdqa xmm5, xmm6 /* xmm5 = pixel5^2 */
  634. movdqa xmm2, [edi+32] /* Pixel 2 */
  635. movdqa xmm6, [edi+96] /* Pixel 6 */
  636. psubw xmm2, xmm3 /* xmm2 -=128 */
  637. psubw xmm6, xmm3 /* xmm6 -=128 */
  638. paddw xmm0, xmm2 /* xmm0 += pixel 2 */
  639. paddw xmm4, xmm6 /* xmm4 += pixel 6 */
  640. pmullw xmm2, xmm2 /* xmm2 = pixel2^2 */
  641. pmullw xmm6, xmm6 /* xmm6 = pixel6^2 */
  642. paddw xmm1, xmm2 /* xmm1 += pixel2^2 */
  643. paddw xmm5, xmm6 /* xmm5 += pixel6^2 */
  644. movdqa xmm2, [edi+48] /* Pixel 3 */
  645. movdqa xmm6, [edi+112] /* Pixel 7 */
  646. psubw xmm2, xmm3 /* xmm2 -=128 */
  647. psubw xmm6, xmm3 /* xmm6 -=128 */
  648. paddw xmm0, xmm2 /* xmm0 += pixel 3 */
  649. paddw xmm4, xmm6 /* xmm4 += pixel 7 */
  650. pmullw xmm2, xmm2 /* xmm2 = pixel3^2 */
  651. pmullw xmm6, xmm6 /* xmm6 = pixel7^2 */
  652. paddw xmm1, xmm2 /* xmm1 += pixel3^2 */
  653. paddw xmm5, xmm6 /* xmm5 += pixel7^2 */
  654. movdqa xmm2, [edi+64] /* Pixel 4 */
  655. movdqa xmm6, [edi+128] /* Pixel 8 */
  656. psubw xmm2, xmm3 /* xmm2 -=128 */
  657. psubw xmm6, xmm3 /* xmm6 -=128 */
  658. paddw xmm0, xmm2 /* xmm0 += pixel 4 */
  659. paddw xmm4, xmm6 /* xmm4 += pixel 8 */
  660. pmullw xmm2, xmm2 /* xmm2 = pixel4^2 */
  661. pmullw xmm6, xmm6 /* xmm6 = pixel8^2 */
  662. paddw xmm1, xmm2 /* xmm1 = pixel4^2 */
  663. paddw xmm5, xmm6 /* xmm5 = pixel8^2 */
  664. /* xmm0 = x1 + x2 + x3 + x4 */
  665. /* xmm1 = x1^2 + x2^2 + x3^2 + x4^2 */
  666. /* xmm4 = x5 + x6 + x7 + x8 */
  667. /* xmm5 = x5^2 + x6^2 + x7^2 + x8^2 */
  668. movdqa xmm7, xmm3 /* xmm7 = xmm3 */
  669. psrlw xmm7, 7 /* xmm7 = 00010001000100010001000100010001 */
  670. movdqa xmm2, xmm0 /* make copy of sum1 */
  671. movdqa xmm6, xmm4 /* make copy of sum2 */
  672. paddw xmm0, xmm7 /* (sum1 + 1) */
  673. paddw xmm4, xmm7 /* (sum2 + 1) */
  674. psraw xmm2, 1 /* sum1 /2 */
  675. psraw xmm6, 1 /* sum2 /2 */
  676. psraw xmm0, 1 /* (sum1 + 1)/2 */
  677. psraw xmm4, 1 /* (sum2 + 1)/2 */
  678. pmullw xmm2, xmm0 /* (sum1)/2*(sum1+1)/2 */
  679. pmullw xmm6, xmm4 /* (sum2)/2*(sum2+1)/2 */
  680. psubw xmm1, xmm2 /* Variance 1 */
  681. psubw xmm5, xmm6 /* Variance 2 */
  682. movdqa xmm7, FLimitWMT /* xmm7 = FLimit */
  683. movdqa xmm2, xmm1 /* copy of Varinace 1*/
  684. movdqa [Variance1], xmm1 /* save the varinace1 */
  685. movdqa [Variance2], xmm5 /* save the varinace2 */
  686. movdqa xmm6, xmm5 /* Variance 2 */
  687. psubw xmm1, xmm7 /* Variance 1 < Flimit? */
  688. psubw xmm5, xmm7 /* Variance 2 < Flimit? */
  689. psraw xmm2, 15 /* Variance 1 > 32768? */
  690. psraw xmm6, 15 /* Vaiance 2 > 32768? */
  691. psraw xmm1, 15 /* FFFF/0000 for true/false */
  692. psraw xmm5, 15 /* FFFF/0000 for true/false */
  693. movdqa xmm7, [edi+64] /* xmm0 = Pixel 4 */
  694. pandn xmm2, xmm1 /* Variance1<32678 &&
  695. Variance1<Limit */
  696. pandn xmm6, xmm5 /* Variance2<32678 &&
  697. Variance1<Limit */
  698. movdqa xmm4, [edi+80] /* xmm4 = Pixel 5 */
  699. pand xmm6, xmm2 /* xmm6 = Variance1 < Flimit */
  700. /* &&Variance2 < Flimit */
  701. movdqa xmm2, xmm7 /* make copy of Pixel4 */
  702. psubusw xmm7, xmm4 /* 4 - 5 */
  703. psubusw xmm4, xmm2 /* 5 - 4 */
  704. por xmm7, xmm4 /* abs(4 - 5) */
  705. psubw xmm7, QStepWMT /* abs(4-5)<QStepxmmx ? */
  706. psraw xmm7, 15 /* FFFF/0000 for True/Flase */
  707. pand xmm7, xmm6
  708. /* xmm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
  709. /* xmm7 now are in use */
  710. /* Let's do the filtering now */
  711. /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
  712. /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
  713. movdqa xmm5, [edi] /* xmm5 = -5 */
  714. movdqa xmm4, [edi + 16] /* xmm4 = -4 */
  715. movdqa xmm3, xmm4 /* copy of -4 */
  716. movdqa xmm6, xmm5 /* copy of -5 */
  717. psubusw xmm4, xmm6 /* xmm4 = [-4] - [-5] */
  718. psubusw xmm5, xmm3 /* xmm5 = [-5] - [-4] */
  719. por xmm4, xmm5 /* abs([-4]-[-5] ) */
  720. psubw xmm4, QStepWMT /* abs([-4]-[-5] )<QStep? */
  721. psraw xmm4, 15 /* FFFF/0000 for True/False */
  722. movdqa xmm1, xmm4 /* copy of the xmm4 */
  723. pand xmm4, xmm6 /* */
  724. pandn xmm1, xmm3 /* */
  725. por xmm1, xmm4 /* xmm1 = p1 */
  726. /* now find P2 */
  727. movdqa xmm4, [edi+128] /* xmm4 = [3] */
  728. movdqa xmm5, [edi+144] /* xmm5 = [4] */
  729. movdqa xmm3, xmm4 /* copy of 3 */
  730. movdqa xmm6, xmm5 /* copy of 4 */
  731. psubusw xmm4, xmm6 /* xmm4 = [3] - [4] */
  732. psubusw xmm5, xmm3 /* xmm5 = [4] - [3] */
  733. por xmm4, xmm5 /* abs([3]-[4] ) */
  734. psubw xmm4, QStepWMT /* abs([3]-[4] )<QStep? */
  735. psraw xmm4, 15 /* FFFF/0000 for True/False */
  736. movdqa xmm2, xmm4 /* copy of the xmm4 */
  737. pand xmm4, xmm6 /* */
  738. pandn xmm2, xmm3 /* */
  739. por xmm2, xmm4 /* xmm2 = p2 */
  740. /* Data is ready, now do the filtering */
  741. pxor xmm0, xmm0 /* clear xmm0 */
  742. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  743. /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
  744. /* Des[-w4] = Src[-w4]; */
  745. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
  746. movdqa xmm3, xmm1 /* xmm3 = p1 */
  747. paddw xmm3, xmm3 /* xmm3 = p1 + p1 */
  748. paddw xmm3, xmm1 /* xmm3 = p1 + p1 + p1 */
  749. movdqa xmm4, [edi+16] /* xmm4 = x1 */
  750. paddw xmm3, [edi+32] /* xmm3 = p1+p1+p1+ x2 */
  751. paddw xmm4, [edi+48] /* xmm4 = x1+x3 */
  752. paddw xmm3, [edi+64] /* xmm3 += x4 */
  753. paddw xmm4, EightFours /* xmm4 = x1 + x3 + 4 */
  754. paddw xmm3, xmm4 /* xmm3 = 3*p1+x1+x2+x3+x4+4 */
  755. movdqa xmm4, xmm3 /* xmm4 = xmm3 */
  756. movdqa xmm5, [edi+16] /* xmm5 = x1 */
  757. paddw xmm4, xmm5 /* xmm4 = sum+x1 */
  758. psllw xmm4, 1 /* xmm4 = (sum+x1)<<1 */
  759. psubw xmm4, [edi+64] /* xmm4 = (sum+x1)<<1-x4 */
  760. paddw xmm4, [edi+80] /* xmm4 = (sum+x1)<<1-x4+x5 */
  761. psraw xmm4, 4 /* xmm4 >>=4 */
  762. psubw xmm4, xmm5 /* New Value - old Value */
  763. pand xmm4, xmm7 /* And the flag */
  764. paddw xmm4, xmm5 /* add the old value back */
  765. packuswb xmm4, xmm0 /* pack it to bytes */
  766. movdq2q mm0, xmm4 /* Write new x1 */
  767. /* sum += x5 -p1 */
  768. /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
  769. movdqa xmm5, [edi+32] /* xmm5= x2 */
  770. psubw xmm3, xmm1 /* sum=sum-p1 */
  771. paddw xmm3, [edi+80] /* sum=sum+x5 */
  772. movdqa xmm4, xmm5 /* copy sum */
  773. paddw xmm4, xmm3 /* xmm4=sum+x2 */
  774. paddw xmm4, xmm4 /* xmm4 <<= 1 */
  775. psubw xmm4, [edi+80] /* xmm4 =(sum+x2)<<1-x5 */
  776. paddw xmm4, [edi+96] /* xmm4 =(sum+x2)<<1-x5+x6 */
  777. psraw xmm4, 4 /* xmm4=((sum+x2)<<1-x5+x6)>>4 */
  778. psubw xmm4, xmm5 /* new value - old value */
  779. pand xmm4, xmm7 /* And the flag */
  780. paddw xmm4, xmm5 /* add the old value back */
  781. packuswb xmm4, xmm0 /* pack it to bytes */
  782. movdq2q mm1, xmm4 /* write new x2 */
  783. /* sum += x6 - p1 */
  784. /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
  785. movdqa xmm5, [edi+48] /* xmm5= x3 */
  786. psubw xmm3, xmm1 /* sum=sum-p1 */
  787. paddw xmm3, [edi+96] /* sum=sum+x6 */
  788. movdqa xmm4, xmm5 /* copy x3 */
  789. paddw xmm4, xmm3 /* xmm4=sum+x3 */
  790. paddw xmm4, xmm4 /* xmm4 <<= 1 */
  791. psubw xmm4, [edi+96] /* xmm4 =(sum+x3)<<1-x6 */
  792. paddw xmm4, [edi+112] /* xmm4 =(sum+x3)<<1-x6+x7 */
  793. psraw xmm4, 4 /* xmm4=((sum+x3)<<1-x6+x7)>>4 */
  794. psubw xmm4, xmm5 /* new value - old value */
  795. pand xmm4, xmm7 /* And the flag */
  796. paddw xmm4, xmm5 /* add the old value back */
  797. packuswb xmm4, xmm0 /* pack it to bytes */
  798. movdq2q mm2, xmm4 /* write new x3 */
  799. /* sum += x7 - p1 */
  800. /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
  801. movdqa xmm5, [edi+64] /* xmm5 = x4 */
  802. psubw xmm3, xmm1 /* sum = sum-p1 */
  803. paddw xmm3, [edi+112] /* sum = sum+x7 */
  804. movdqa xmm4, xmm5 /* xmm4 = x4 */
  805. paddw xmm4, xmm3 /* xmm4 = sum + x4 */
  806. paddw xmm4, xmm4 /* xmm4 *=2 */
  807. paddw xmm4, xmm1 /* += p1 */
  808. psubw xmm4, [edi+16] /* -= x1 */
  809. psubw xmm4, [edi+112] /* -= x7 */
  810. paddw xmm4, [edi+128] /* += x8 */
  811. psraw xmm4, 4 /* >>=4 */
  812. psubw xmm4, xmm5 /* -=x4 */
  813. pand xmm4, xmm7 /* and flag */
  814. paddw xmm4, xmm5 /* += x4 */
  815. packuswb xmm4, xmm0 /* pack it to bytes */
  816. movdq2q mm3, xmm4 /* write new x4 */
  817. /* sum+= x8-x1 */
  818. /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
  819. movdqa xmm5, [edi+80] /* xmm5 = x5 */
  820. psubw xmm3, [edi+16] /* sum -= x1 */
  821. paddw xmm3, [edi+128] /* sub += x8 */
  822. movdqa xmm4, xmm5 /* xmm4 = x5 */
  823. paddw xmm4, xmm3 /* xmm4= sum+x5 */
  824. paddw xmm4, xmm4 /* xmm4 *= 2 */
  825. paddw xmm4, [edi+16] /* += x1 */
  826. psubw xmm4, [edi+32] /* -= x2 */
  827. psubw xmm4, [edi+128] /* -= x8 */
  828. paddw xmm4, xmm2 /* += p2 */
  829. psraw xmm4, 4 /* >>=4 */
  830. psubw xmm4, xmm5 /* -=x5 */
  831. pand xmm4, xmm7 /* and flag */
  832. paddw xmm4, xmm5 /* += x5 */
  833. packuswb xmm4, xmm0 /* pack to bytes */
  834. movdq2q mm4, xmm4 /* write new x5 */
  835. /* sum += p2 - x2 */
  836. /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
  837. movdqa xmm5, [edi+96] /* xmm5 = x6 */
  838. psubw xmm3, [edi+32] /* -= x2 */
  839. paddw xmm3, xmm2 /* += p2 */
  840. movdqa xmm4, xmm5 /* xmm4 = x6 */
  841. paddw xmm4, xmm3 /* xmm4 = sum+x6 */
  842. paddw xmm4, xmm4 /* xmm4 *= 2*/
  843. paddw xmm4, [edi+32] /* +=x2 */
  844. psubw xmm4, [edi+48] /* -=x3 */
  845. psraw xmm4, 4 /* >>=4 */
  846. psubw xmm4, xmm5 /* -=x6 */
  847. pand xmm4, xmm7 /* and flag */
  848. paddw xmm4, xmm5 /* += x6 */
  849. packuswb xmm4, xmm0 /* pack to bytes */
  850. movdq2q mm5, xmm4 /* write new x6 */
  851. /* sum += p2 - x3 */
  852. /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
  853. movdqa xmm5, [edi+112] /* xmm5 = x7 */
  854. psubw xmm3, [edi+48] /* -= x3 */
  855. paddw xmm3, xmm2 /* += p2 */
  856. movdqa xmm4, xmm5 /* xmm4 = x7 */
  857. paddw xmm4, xmm3 /* xmm4 = sum+x7 */
  858. paddw xmm4, xmm4 /* xmm4 *= 2*/
  859. paddw xmm4, [edi+48] /* +=x3 */
  860. psubw xmm4, [edi+64] /* -=x4 */
  861. psraw xmm4, 4 /* >>=4 */
  862. psubw xmm4, xmm5 /* -=x7 */
  863. pand xmm4, xmm7 /* and flag */
  864. paddw xmm4, xmm5 /* += x7 */
  865. packuswb xmm4, xmm0 /* pack to bytes */
  866. movdq2q mm6, xmm4 /* write new x7 */
  867. /* sum += p2 - x4 */
  868. /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
  869. movdqa xmm5, [edi+128] /* xmm5 = x8 */
  870. psubw xmm3, [edi+64] /* -= x4 */
  871. paddw xmm3, xmm2 /* += p2 */
  872. movdqa xmm4, xmm5 /* xmm4 = x8 */
  873. paddw xmm4, xmm3 /* xmm4 = sum+x8 */
  874. paddw xmm4, xmm4 /* xmm4 *= 2*/
  875. paddw xmm4, [edi+64] /* +=x4 */
  876. psubw xmm4, [edi+80] /* -=x5 */
  877. psraw xmm4, 4 /* >>=4 */
  878. psubw xmm4, xmm5 /* -=x8 */
  879. pand xmm4, xmm7 /* and flag */
  880. paddw xmm4, xmm5 /* += x8 */
  881. packuswb xmm4, xmm0 /* pack to bytes */
  882. movdq2q mm7, xmm4 /* write new x8 */
  883. /* transpose */
  884. movq2dq xmm0, mm0 /* xmm0 = 70 60 50 40 30 20 10 00 */
  885. movq2dq xmm1, mm1 /* xmm1 = 71 61 51 41 31 21 11 01 */
  886. movq2dq xmm2, mm2 /* xmm2 = 72 62 52 42 32 22 12 02 */
  887. movq2dq xmm3, mm3 /* xmm3 = 73 63 53 43 33 23 13 03 */
  888. punpcklbw xmm0, xmm1 /* xmm0 = 7170 6160 5150 4140 3130 2120 1110 0100 */
  889. punpcklbw xmm2, xmm3 /* xmm2 = 7372 6362 5352 4342 3332 2322 1312 0302 */
  890. movdqa xmm1, xmm0 /* xmm1 = 7170 6160 5150 4140 3130 2120 1110 0100 */
  891. punpcklwd xmm0, xmm2 /* xmm0 = 33323130 23222120 13121110 03020100 */
  892. punpckhwd xmm1, xmm2 /* xmm1 = 73727170 63626160 53525150 43424140 */
  893. movq2dq xmm4, mm4 /* xmm4 = 74 64 54 44 34 24 14 04 */
  894. movq2dq xmm5, mm5 /* xmm5 = 75 65 55 45 35 25 15 05 */
  895. movq2dq xmm6, mm6 /* xmm6 = 76 66 56 46 36 26 16 06 */
  896. movq2dq xmm7, mm7 /* xmm7 = 77 67 57 47 37 27 17 07 */
  897. punpcklbw xmm4, xmm5 /* xmm4 = 7574 6564 5554 4544 3534 2524 1514 0504 */
  898. punpcklbw xmm6, xmm7 /* xmm6 = 7776 6766 5756 4746 3736 2726 1716 0706 */
  899. movdqa xmm5, xmm4 /* xmm5 = 7574 6564 5554 4544 3534 2524 1514 0504 */
  900. punpcklwd xmm4, xmm6 /* xmm4 = 37363534 27262524 17161514 07060504 */
  901. punpckhwd xmm5, xmm6 /* xmm5 = 77767574 67666564 57565554 47464544 */
  902. movdqa xmm2, xmm0 /* xmm2 = 33323130 23222120 13121110 03020100 */
  903. punpckldq xmm0, xmm4 /* xmm0 = 1716151413121110 0706050403020100 */
  904. movq QWORD PTR [esi+edx*2],xmm0 /* write 00 01 02 03 04 05 06 07 */
  905. psrldq xmm0, 8 /* xmm0 = 1716151413121110 */
  906. punpckhdq xmm2, xmm4 /* xmm2 = 3736353433323130 2726252423222120 */
  907. movq QWORD PTR [esi+edx], xmm0 /* write 10 11 12 13 14 15 16 17 */
  908. movdqa xmm3, xmm1 /* xmm3 = 73727170 63626160 53525150 43424140 */
  909. punpckldq xmm1, xmm5 /* xmm1 = 5756555453525150 4746454443424140 */
  910. movq QWORD PTR [esi], xmm2 /* write 20 21 22 23 24 25 26 27 */
  911. psrldq xmm2, 8 /* xmm2 = 3736353433323130 */
  912. punpckhdq xmm3, xmm5 /* xmm3 = 7776757473727170 6766656463626160 */
  913. movq QWORD PTR [esi+ecx], xmm2 /* write 30 31 32 33 34 35 36 37 */
  914. lea esi, [esi+ecx*4] /* esi= Des - 4 + 4 *pitch */
  915. movq QWORD PTR [esi+edx*2], xmm1 /* write 40 41 42 43 44 45 46 47 */
  916. movq QWORD PTR [esi], xmm3 /* write 60 61 62 63 64 65 66 67 */
  917. psrldq xmm1, 8 /* xmm1 = 5756555453525150 */
  918. psrldq xmm3, 8 /* xmm3 = 7776757473727170 */
  919. movq QWORD PTR [esi+edx], xmm1 /* write 50 51 52 53 54 55 56 57 */
  920. movq QWORD PTR [esi+ecx], xmm3 /* write 70 71 72 73 74 75 76 77 */
  921. pop edi
  922. pop esi
  923. pop edx
  924. pop ecx
  925. pop eax
  926. }// end of __asm
  927. Var1=Variance1[0]+Variance1[1]+Variance1[2]+Variance1[3]+Variance1[4]+Variance1[5]+Variance1[6]+Variance1[7];
  928. Var2=Variance2[0]+Variance2[1]+Variance2[2]+Variance2[3]+Variance2[4]+Variance2[5]+Variance2[6]+Variance2[7];
  929. pbi->FragmentVariances[CurrentFrag] += Var1;
  930. pbi->FragmentVariances[CurrentFrag + 1] += Var2;
  931. }// end of if
  932. CurrentFrag ++;
  933. Src += 8;
  934. Des += 8;
  935. }//end of while
  936. #endif
  937. }
  938. /****************************************************************************
  939. *
  940. * ROUTINE : DeblockNonFilteredBand_WMT
  941. *
  942. * INPUTS : pbi, SrcPtr, DesPtr, PlaneLineStep, FragAcross, StartFrag, QuantScale
  943. *
  944. * OUTPUTS : Filtered pixels written through DesPtr; pbi->FragmentVariances updated
  945. *
  946. * RETURNS : None
  947. *
  948. * FUNCTION : Filters both the horizontal and vertical edges in a band
  949. *
  950. * SPECIAL NOTES :
  951. *
  952. * REFERENCE :
  953. *
  954. * ERRORS : None.
  955. *
  956. ****************************************************************************/
  957. void DeblockNonFilteredBand_WMT(
  958. POSTPROC_INSTANCE *pbi,
  959. UINT8 *SrcPtr,
  960. UINT8 *DesPtr,
  961. UINT32 PlaneLineStep,
  962. UINT32 FragAcross,
  963. UINT32 StartFrag,
  964. UINT32 *QuantScale
  965. )
  966. {
  967. UINT32 j;
  968. UINT32 CurrentFrag=StartFrag;
  969. UINT32 QStep;
  970. UINT32 LoopFLimit;
  971. UINT8 *Src, *Des;
  972. UINT32 Var1, Var2;
  973. #if defined(_WIN32_WCE)
  974. return;
  975. #else
  976. __declspec(align(16)) short QStepWMT[8];
  977. __declspec(align(16)) short FLimitWMT[8];
  978. __declspec(align(16)) short Rows[80];
  979. __declspec(align(16)) short LoopFLimitWMT[8];
  980. __declspec(align(16)) short LoopFilteredValuesUp[8];
  981. __declspec(align(16)) short LoopFilteredValuesDown[8];
  982. __declspec(align(16)) unsigned short Variance1[8];
  983. __declspec(align(16)) unsigned short Variance2[8];
  984. LoopFLimit = DeblockLimitValuesV2[pbi->FrameQIndex];
  985. LoopFLimitWMT[0] = (INT16)LoopFLimit;
  986. LoopFLimitWMT[1] = (INT16)LoopFLimit;
  987. LoopFLimitWMT[2] = (INT16)LoopFLimit;
  988. LoopFLimitWMT[3] = (INT16)LoopFLimit;
  989. LoopFLimitWMT[4] = (INT16)LoopFLimit;
  990. LoopFLimitWMT[5] = (INT16)LoopFLimit;
  991. LoopFLimitWMT[6] = (INT16)LoopFLimit;
  992. LoopFLimitWMT[7] = (INT16)LoopFLimit;
  993. while(CurrentFrag < StartFrag + FragAcross )
  994. {
  995. Src=SrcPtr+8*(CurrentFrag-StartFrag);
  996. Des=DesPtr+8*(CurrentFrag-StartFrag);
  997. QStep = QuantScale[ pbi->FragQIndex[CurrentFrag+FragAcross]];
  998. __asm
  999. {
  1000. push eax
  1001. push ecx
  1002. push edx
  1003. push esi
  1004. push edi
  1005. /* Calculate the FLimit and store FLimit and QStep */
  1006. /* Copy the data to the intermediate buffer */
  1007. mov eax, QStep
  1008. xor edx, edx /* clear edx */
  1009. mov ecx, PlaneLineStep /* ecx = Pitch */
  1010. pcmpeqw xmm6, xmm6 /* xmm6 = FFFFFF... */
  1011. movd mm5, eax /* mm5 = QStep */
  1012. psrlw xmm6, 14 /* xmm6 = 3, 3, 3, 3, 3, 3, 3, 3*/
  1013. punpcklwd mm5, mm5 /* mm5 = QQ */
  1014. mov eax, Src /* eax = Src */
  1015. punpckldq mm5, mm5 /* mm5 = QQQQ */
  1016. sub edx, ecx /* edx = - Pitch */
  1017. movq2dq xmm5, mm5 /* xmm5 = QQQQ */
  1018. punpcklqdq xmm5, xmm5 /* xmm5 = QQQQQQQQ */
  1019. pmullw xmm6, xmm5 /* Qstep * 3 */
  1020. movdqa QStepWMT, xmm5
  1021. lea edi, Rows /* edi = Rows */
  1022. pxor xmm7, xmm7 /* Clear mm7 */
  1023. mov esi, Des /* esi = des */
  1024. pmullw xmm6, xmm5
  1025. lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
  1026. lea esi, [esi + edx * 2] /* esi = Des - 2*Pitch */
  1027. psraw xmm6, 5
  1028. movdqa FLimitWMT, xmm6
  1029. /* Copy the data to the intermediate buffer */
  1030. movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[-5*Pitch] */
  1031. movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[-4*Pitch */
  1032. punpcklbw xmm0, xmm7 /* expand to words */
  1033. punpcklbw xmm1, xmm7 /* expand to words */
  1034. movdqa [edi], xmm0 /* write 8 words */
  1035. movdqa [edi+16], xmm1 /* write 8 words */
  1036. movq xmm2, QWORD PTR [eax+ecx] /* xmm2 = Src[-3*Pitch] */
  1037. movq xmm3, QWORD PTR [eax+ecx*2]/* xmm3 = Src[-2*Pitch] */
  1038. punpcklbw xmm2, xmm7 /* expand to words */
  1039. punpcklbw xmm3, xmm7 /* expand to words */
  1040. movdqa [edi+32], xmm2 /* write 8 words */
  1041. movdqa [edi+48], xmm3 /* write 8 words */
  1042. lea eax, [eax+ecx*4] /* eax= Src */
  1043. movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[-Pitch] */
  1044. movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[0] */
  1045. punpcklbw xmm0, xmm7 /* expand to words */
  1046. punpcklbw xmm1, xmm7 /* expand to words */
  1047. movdqa [edi+64], xmm0 /* write 8 words */
  1048. movdqa [edi+80], xmm1 /* write 8 words */
  1049. movq xmm2, QWORD PTR [eax+ecx] /* xmm2 = Src[Pitch] */
  1050. movq xmm3, QWORD PTR [eax+ecx*2]/* xmm3 = Src[2*Pitch] */
  1051. punpcklbw xmm2, xmm7 /* expand to words */
  1052. punpcklbw xmm3, xmm7 /* expand to words */
  1053. movdqa [edi+96], xmm2 /* write 8 words */
  1054. movdqa [edi+112], xmm3 /* write 8 words */
  1055. lea eax, [eax+ecx*4] /* eax= Src+4*Pitch */
  1056. movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[3*Pitch] */
  1057. movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[4*Pitch] */
  1058. punpcklbw xmm0, xmm7 /* expand to words */
  1059. punpcklbw xmm1, xmm7 /* expand to words */
  1060. movdqa [edi+128], xmm0 /* write 8 words */
  1061. movdqa [edi+144], xmm1 /* write 8 words */
  1062. /* done with copying everything to intermediate buffer */
  1063. /* Now, compute the variances for Pixel 1-4 and 5-8 */
  1064. /* we use xmm0,xmm1,xmm2 for 1234 and xmm4, xmm5, xmm6 for 5-8 */
  1065. /* xmm7 = 0, xmm3 = {128, 128, 128, 128, 128, 128, 128, 128} */
  1066. pcmpeqw xmm3, xmm3 /* xmm3 = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF */
  1067. psllw xmm3, 15 /* xmm3 = 80008000800080008000800080008000 */
  1068. psrlw xmm3, 8 /* xmm3 = 00800080008000800080008000800080 */
  1069. movdqa xmm2, [edi+16] /* Pixel 1 */
  1070. movdqa xmm6, [edi+80] /* Pixel 5 */
  1071. psubw xmm2, xmm3 /* xmm2 -=128 */
  1072. psubw xmm6, xmm3 /* xmm6 -=128 */
  1073. movdqa xmm0, xmm2 /* xmm0 = pixel 1 */
  1074. movdqa xmm4, xmm6 /* xmm4 = pixel 5 */
  1075. pmullw xmm2, xmm2 /* xmm2 = pixel1 * pixel1 */
  1076. pmullw xmm6, xmm6 /* xmm6 = pixel5 * pixel5 */
  1077. movdqa xmm1, xmm2 /* xmm1 = pixel1^2 */
  1078. movdqa xmm5, xmm6 /* xmm5 = pixel5^2 */
  1079. movdqa xmm2, [edi+32] /* Pixel 2 */
  1080. movdqa xmm6, [edi+96] /* Pixel 6 */
  1081. psubw xmm2, xmm3 /* xmm2 -=128 */
  1082. psubw xmm6, xmm3 /* xmm6 -=128 */
  1083. paddw xmm0, xmm2 /* xmm0 += pixel 2 */
  1084. paddw xmm4, xmm6 /* xmm4 += pixel 6 */
  1085. pmullw xmm2, xmm2 /* xmm2 = pixel2^2 */
  1086. pmullw xmm6, xmm6 /* xmm6 = pixel6^2 */
  1087. paddw xmm1, xmm2 /* xmm1 += pixel2^2 */
  1088. paddw xmm5, xmm6 /* xmm5 += pixel6^2 */
  1089. movdqa xmm2, [edi+48] /* Pixel 3 */
  1090. movdqa xmm6, [edi+112] /* Pixel 7 */
  1091. psubw xmm2, xmm3 /* xmm2 -=128 */
  1092. psubw xmm6, xmm3 /* xmm6 -=128 */
  1093. paddw xmm0, xmm2 /* xmm0 += pixel 3 */
  1094. paddw xmm4, xmm6 /* xmm4 += pixel 7 */
  1095. pmullw xmm2, xmm2 /* xmm2 = pixel3^2 */
  1096. pmullw xmm6, xmm6 /* xmm6 = pixel7^2 */
  1097. paddw xmm1, xmm2 /* xmm1 += pixel3^2 */
  1098. paddw xmm5, xmm6 /* xmm5 += pixel7^2 */
  1099. movdqa xmm2, [edi+64] /* Pixel 4 */
  1100. movdqa xmm6, [edi+128] /* Pixel 8 */
  1101. psubw xmm2, xmm3 /* xmm2 -=128 */
  1102. psubw xmm6, xmm3 /* xmm6 -=128 */
  1103. paddw xmm0, xmm2 /* xmm0 += pixel 4 */
  1104. paddw xmm4, xmm6 /* xmm4 += pixel 8 */
  1105. pmullw xmm2, xmm2 /* xmm2 = pixel4^2 */
  1106. pmullw xmm6, xmm6 /* xmm6 = pixel8^2 */
  1107. paddw xmm1, xmm2 /* xmm1 = pixel4^2 */
  1108. paddw xmm5, xmm6 /* xmm5 = pixel8^2 */
  1109. /* xmm0 = x1^2 + x2^2 + x3^2 + x4^2 */
  1110. /* xmm1 = x1 + x2 + x3 + x4 */
  1111. /* xmm4 = x5^2 + x6^2 + x7^2 + x8^2 */
  1112. /* xmm5 = x5 + x6 + x7 + x8 */
  1113. movdqa xmm7, xmm3 /* xmm7 = xmm3 */
  1114. psrlw xmm7, 7 /* xmm7 = 00010001000100010001000100010001 */
  1115. movdqa xmm2, xmm0 /* make copy of sum1 */
  1116. movdqa xmm6, xmm4 /* make copy of sum2 */
  1117. paddw xmm0, xmm7 /* (sum1 + 1) */
  1118. paddw xmm4, xmm7 /* (sum2 + 1) */
  1119. psraw xmm2, 1 /* sum1 /2 */
  1120. psraw xmm6, 1 /* sum2 /2 */
  1121. psraw xmm0, 1 /* (sum1 + 1)/2 */
  1122. psraw xmm4, 1 /* (sum2 + 1)/2 */
  1123. pmullw xmm2, xmm0 /* (sum1)/2*(sum1+1)/2 */
  1124. pmullw xmm6, xmm4 /* (sum2)/2*(sum2+1)/2 */
  1125. psubw xmm1, xmm2 /* Variance 1 */
  1126. psubw xmm5, xmm6 /* Variance 2 */
  1127. movdqa xmm7, FLimitWMT /* xmm7 = FLimit */
  1128. movdqa xmm2, xmm1 /* copy of Varinace 1*/
  1129. movdqa [Variance1], xmm1 /* save the varinace1 */
  1130. movdqa [Variance2], xmm5 /* save the varinace2 */
  1131. movdqa xmm6, xmm5 /* Variance 2 */
  1132. psubw xmm1, xmm7 /* Variance 1 < Flimit? */
  1133. psubw xmm5, xmm7 /* Variance 2 < Flimit? */
  1134. psraw xmm2, 15 /* Variance 1 > 32768? */
  1135. psraw xmm6, 15 /* Vaiance 2 > 32768? */
  1136. psraw xmm1, 15 /* FFFF/0000 for true/false */
  1137. psraw xmm5, 15 /* FFFF/0000 for true/false */
  1138. movdqa xmm7, [edi+64] /* xmm0 = Pixel 4 */
  1139. pandn xmm2, xmm1 /* Variance1<32678 &&
  1140. Variance1<Limit */
  1141. pandn xmm6, xmm5 /* Variance2<32678 &&
  1142. Variance1<Limit */
  1143. movdqa xmm4, [edi+80] /* xmm4 = Pixel 5 */
  1144. pand xmm6, xmm2 /* xmm6 = Variance1 < Flimit */
  1145. /* &&Variance2 < Flimit */
  1146. movdqa xmm2, xmm7 /* make copy of Pixel4 */
  1147. psubusw xmm7, xmm4 /* 4 - 5 */
  1148. psubusw xmm4, xmm2 /* 5 - 4 */
  1149. por xmm7, xmm4 /* abs(4 - 5) */
  1150. psubw xmm7, QStepWMT /* abs(4-5)<QStepxmmx ? */
  1151. psraw xmm7, 15 /* FFFF/0000 for True/Flase */
  1152. pand xmm7, xmm6
  1153. /* xmm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
  1154. /* xmm7 now are in use */
  1155. /* find the loop filtered values for the pixels on block boundary */
  1156. movdqa xmm1, LoopFLimitWMT; /* Get the Flimit values for loop filter */
  1157. movdqa xmm3, [edi + 48] /* xmm3 = x3 = p[-2] */
  1158. movdqa xmm4, [edi + 64] /* mm4 = x4 = p[-1] */
  1159. movdqa xmm5, [edi + 80] /* mm5 = x5 = p[ 0] */
  1160. movdqa xmm6, [edi + 96] /* mm6 = x6 = p[ 1] */
  1161. psubw xmm5, xmm4 /* mm5 = p[ 0] - p[-1] */
  1162. psubw xmm3, xmm6 /* mm3 = p[-2] - p[ 1] */
  1163. movdqa xmm4, xmm5 /* make a copy */
  1164. paddw xmm4, xmm5 /* 2 * ( p[0] - p[-1] ) */
  1165. paddw xmm3, EightFours /* mm3 + 4 */
  1166. paddw xmm5, xmm4 /* 3 * ( p[0] - p[-1] ) */
  1167. paddw xmm3, xmm5 /* Filtval before shift */
  1168. psraw xmm3, 3 /* FiltVal */
  1169. movdqa xmm2, xmm3 /* make a copy */
  1170. psraw xmm3, 15 /* FFFF->Neg, 0000->Pos */
  1171. pxor xmm2, xmm3
  1172. psubsw xmm2, xmm3 /* mm2 = abs(FiltVal) */
  1173. por xmm3, EightOnes /* -1 and 1 for + and - */
  1174. movdqa xmm4, xmm1 /* make a copy of Flimit */
  1175. psubw xmm1, xmm2 /* mm1= Flimit - abs(FiltVal) */
  1176. movdqa xmm5, xmm1 /* copy Flimit - abs(FiltVal) */
  1177. psraw xmm1, 15 /* FFFF or 0000 */
  1178. pxor xmm5, xmm1
  1179. psubsw xmm5, xmm1 /* abs(Flimit - abs(FiltVal)) */
  1180. psubusw xmm4, xmm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
  1181. pmullw xmm4, xmm3 /* get the sign back */
  1182. movdqa xmm1, [edi+64] /* p[-1] */
  1183. movdqa xmm2, [edi+80] /* p[0] */
  1184. paddw xmm1, mm4 /* p[-1] + NewFiltVal */
  1185. psubw xmm2, mm4 /* p[0] - NewFiltVal */
  1186. pxor xmm6, xmm6 /* clear mm6 */
  1187. packuswb xmm1, xmm1 /* clamping */
  1188. packuswb xmm2, xmm2
  1189. punpcklbw xmm1, xmm6 /* unpack to word */
  1190. movdqa LoopFilteredValuesUp, xmm1 /* save the values */
  1191. punpcklbw xmm2, xmm6 /* unpack to word */
  1192. movdqa LoopFilteredValuesDown, xmm2 /* save the values */
  1193. /* Let's do the filtering now */
  1194. /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
  1195. /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
  1196. movdqa xmm5, [edi] /* xmm5 = -5 */
  1197. movdqa xmm4, [edi + 16] /* xmm4 = -4 */
  1198. movdqa xmm3, xmm4 /* copy of -4 */
  1199. movdqa xmm6, xmm5 /* copy of -5 */
  1200. psubusw xmm4, xmm6 /* xmm4 = [-4] - [-5] */
  1201. psubusw xmm5, xmm3 /* xmm5 = [-5] - [-4] */
  1202. por xmm4, xmm5 /* abs([-4]-[-5] ) */
  1203. psubw xmm4, QStepWMT /* abs([-4]-[-5] )<QStep? */
  1204. psraw xmm4, 15 /* FFFF/0000 for True/False */
  1205. movdqa xmm1, xmm4 /* copy of the xmm4 */
  1206. pand xmm4, xmm6 /* */
  1207. pandn xmm1, xmm3 /* */
  1208. por xmm1, xmm4 /* xmm1 = p1 */
  1209. /* now find P2 */
  1210. movdqa xmm4, [edi+128] /* xmm4 = [3] */
  1211. movdqa xmm5, [edi+144] /* xmm5 = [4] */
  1212. movdqa xmm3, xmm4 /* copy of 3 */
  1213. movdqa xmm6, xmm5 /* copy of 4 */
  1214. psubusw xmm4, xmm6 /* xmm4 = [3] - [4] */
  1215. psubusw xmm5, xmm3 /* xmm5 = [4] - [3] */
  1216. por xmm4, xmm5 /* abs([3]-[4] ) */
  1217. psubw xmm4, QStepWMT /* abs([3]-[4] )<QStep? */
  1218. psraw xmm4, 15 /* FFFF/0000 for True/False */
  1219. movdqa xmm2, xmm4 /* copy of the xmm4 */
  1220. pand xmm4, xmm6 /* */
  1221. pandn xmm2, xmm3 /* */
  1222. por xmm2, xmm4 /* xmm2 = p2 */
  1223. /* Data is ready, now do the filtering */
  1224. pxor xmm0, xmm0 /* clear xmm0 */
  1225. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  1226. /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
  1227. /* Des[-w4] = Src[-w4]; */
  1228. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
  1229. movdqa xmm3, xmm1 /* xmm3 = p1 */
  1230. paddw xmm3, xmm3 /* xmm3 = p1 + p1 */
  1231. paddw xmm3, xmm1 /* xmm3 = p1 + p1 + p1 */
  1232. movdqa xmm4, [edi+16] /* xmm4 = x1 */
  1233. paddw xmm3, [edi+32] /* xmm3 = p1+p1+p1+ x2 */
  1234. paddw xmm4, [edi+48] /* xmm4 = x1+x3 */
  1235. paddw xmm3, [edi+64] /* xmm3 += x4 */
  1236. paddw xmm4, EightFours /* xmm4 = x1 + x3 + 4 */
  1237. paddw xmm3, xmm4 /* xmm3 = 3*p1+x1+x2+x3+x4+4 */
  1238. movdqa xmm4, xmm3 /* xmm4 = xmm3 */
  1239. movdqa xmm5, [edi+16] /* xmm5 = x1 */
  1240. paddw xmm4, xmm5 /* xmm4 = sum+x1 */
  1241. psllw xmm4, 1 /* xmm4 = (sum+x1)<<1 */
  1242. psubw xmm4, [edi+64] /* xmm4 = (sum+x1)<<1-x4 */
  1243. paddw xmm4, [edi+80] /* xmm4 = (sum+x1)<<1-x4+x5 */
  1244. psraw xmm4, 4 /* xmm4 >>=4 */
  1245. psubw xmm4, xmm5 /* New Value - old Value */
  1246. pand xmm4, xmm7 /* And the flag */
  1247. paddw xmm4, xmm5 /* add the old value back */
  1248. packuswb xmm4, xmm0 /* pack it to bytes */
  1249. movq QWORD PTR [esi+edx*2], xmm4 /* Write new x1 */
  1250. /* sum += x5 -p1 */
  1251. /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
  1252. movdqa xmm5, [edi+32] /* xmm5= x2 */
  1253. psubw xmm3, xmm1 /* sum=sum-p1 */
  1254. paddw xmm3, [edi+80] /* sum=sum+x5 */
  1255. movdqa xmm4, xmm5 /* copy sum */
  1256. paddw xmm4, xmm3 /* xmm4=sum+x2 */
  1257. paddw xmm4, xmm4 /* xmm4 <<= 1 */
  1258. psubw xmm4, [edi+80] /* xmm4 =(sum+x2)<<1-x5 */
  1259. paddw xmm4, [edi+96] /* xmm4 =(sum+x2)<<1-x5+x6 */
  1260. psraw xmm4, 4 /* xmm4=((sum+x2)<<1-x5+x6)>>4 */
  1261. psubw xmm4, xmm5 /* new value - old value */
  1262. pand xmm4, xmm7 /* And the flag */
  1263. paddw xmm4, xmm5 /* add the old value back */
  1264. packuswb xmm4, xmm0 /* pack it to bytes */
  1265. movq QWORD PTR [esi+edx], xmm4 /* write new x2 */
  1266. /* sum += x6 - p1 */
  1267. /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
  1268. movdqa xmm5, [edi+48] /* xmm5= x3 */
  1269. psubw xmm3, xmm1 /* sum=sum-p1 */
  1270. paddw xmm3, [edi+96] /* sum=sum+x6 */
  1271. movdqa xmm4, xmm5 /* copy x3 */
  1272. paddw xmm4, xmm3 /* xmm4=sum+x3 */
  1273. paddw xmm4, xmm4 /* xmm4 <<= 1 */
  1274. psubw xmm4, [edi+96] /* xmm4 =(sum+x3)<<1-x6 */
  1275. paddw xmm4, [edi+112] /* xmm4 =(sum+x3)<<1-x6+x7 */
  1276. psraw xmm4, 4 /* xmm4=((sum+x3)<<1-x6+x7)>>4 */
  1277. psubw xmm4, xmm5 /* new value - old value */
  1278. pand xmm4, xmm7 /* And the flag */
  1279. paddw xmm4, xmm5 /* add the old value back */
  1280. packuswb xmm4, xmm0 /* pack it to bytes */
  1281. movq QWORD PTR [esi],xmm4 /* write new x3 */
  1282. /* sum += x7 - p1 */
  1283. /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
  1284. movdqa xmm5, [edi+64] /* xmm5 = x4 */
  1285. psubw xmm3, xmm1 /* sum = sum-p1 */
  1286. paddw xmm3, [edi+112] /* sum = sum+x7 */
  1287. movdqa xmm4, xmm5 /* xmm4 = x4 */
  1288. paddw xmm4, xmm3 /* xmm4 = sum + x4 */
  1289. paddw xmm4, xmm4 /* xmm4 *=2 */
  1290. paddw xmm4, xmm1 /* += p1 */
  1291. psubw xmm4, [edi+16] /* -= x1 */
  1292. psubw xmm4, [edi+112] /* -= x7 */
  1293. paddw xmm4, [edi+128] /* += x8 */
  1294. movdqa xmm5, LoopFilteredValuesUp /* Read the loop filtered value of x4 */
  1295. psraw xmm4, 4 /* >>=4 */
  1296. psubw xmm4, xmm5 /* -=x4 */
  1297. pand xmm4, xmm7 /* and flag */
  1298. paddw xmm4, xmm5 /* += x4 */
  1299. packuswb xmm4, xmm0 /* pack it to bytes */
  1300. movq QWORD PTR [esi+ecx], xmm4 /* write new x4 */
  1301. /* sum+= x8-x1 */
  1302. /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
  1303. movdqa xmm5, [edi+80] /* xmm5 = x5 */
  1304. psubw xmm3, [edi+16] /* sum -= x1 */
  1305. paddw xmm3, [edi+128] /* sub += x8 */
  1306. movdqa xmm4, xmm5 /* xmm4 = x5 */
  1307. paddw xmm4, xmm3 /* xmm4= sum+x5 */
  1308. paddw xmm4, xmm4 /* xmm4 *= 2 */
  1309. paddw xmm4, [edi+16] /* += x1 */
  1310. psubw xmm4, [edi+32] /* -= x2 */
  1311. psubw xmm4, [edi+128] /* -= x8 */
  1312. paddw xmm4, xmm2 /* += p2 */
  1313. movdqa xmm5, LoopFilteredValuesDown /* Read the loop filtered value of x5 */
  1314. psraw xmm4, 4 /* >>=4 */
  1315. psubw xmm4, xmm5 /* -=x5 */
  1316. pand xmm4, xmm7 /* and flag */
  1317. paddw xmm4, xmm5 /* += x5 */
  1318. lea esi, [esi+ecx*4] /* esi=des + 2*pitch */
  1319. packuswb xmm4, xmm0 /* pack to bytes */
  1320. movq QWORD PTR [esi+edx*2], xmm4 /* write new x5 */
  1321. /* sum += p2 - x2 */
  1322. /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
  1323. movdqa xmm5, [edi+96] /* xmm5 = x6 */
  1324. psubw xmm3, [edi+32] /* -= x2 */
  1325. paddw xmm3, xmm2 /* += p2 */
  1326. movdqa xmm4, xmm5 /* xmm4 = x6 */
  1327. paddw xmm4, xmm3 /* xmm4 = sum+x6 */
  1328. paddw xmm4, xmm4 /* xmm4 *= 2*/
  1329. paddw xmm4, [edi+32] /* +=x2 */
  1330. psubw xmm4, [edi+48] /* -=x3 */
  1331. psraw xmm4, 4 /* >>=4 */
  1332. psubw xmm4, xmm5 /* -=x6 */
  1333. pand xmm4, xmm7 /* and flag */
  1334. paddw xmm4, xmm5 /* += x6 */
  1335. packuswb xmm4, xmm0 /* pack to bytes */
  1336. movq QWORD PTR [esi+edx], xmm4 /* write new x6 */
  1337. /* sum += p2 - x3 */
  1338. /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
  1339. movdqa xmm5, [edi+112] /* xmm5 = x7 */
  1340. psubw xmm3, [edi+48] /* -= x3 */
  1341. paddw xmm3, xmm2 /* += p2 */
  1342. movdqa xmm4, xmm5 /* xmm4 = x7 */
  1343. paddw xmm4, xmm3 /* xmm4 = sum+x7 */
  1344. paddw xmm4, xmm4 /* xmm4 *= 2*/
  1345. paddw xmm4, [edi+48] /* +=x3 */
  1346. psubw xmm4, [edi+64] /* -=x4 */
  1347. psraw xmm4, 4 /* >>=4 */
  1348. psubw xmm4, xmm5 /* -=x7 */
  1349. pand xmm4, xmm7 /* and flag */
  1350. paddw xmm4, xmm5 /* += x7 */
  1351. packuswb xmm4, xmm0 /* pack to bytes */
  1352. movq QWORD PTR [esi],xmm4 /* write new x7 */
  1353. /* sum += p2 - x4 */
  1354. /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
  1355. movdqa xmm5, [edi+128] /* xmm5 = x8 */
  1356. psubw xmm3, [edi+64] /* -= x4 */
  1357. paddw xmm3, xmm2 /* += p2 */
  1358. movdqa xmm4, xmm5 /* xmm4 = x8 */
  1359. paddw xmm4, xmm3 /* xmm4 = sum+x8 */
  1360. paddw xmm4, xmm4 /* xmm4 *= 2*/
  1361. paddw xmm4, [edi+64] /* +=x4 */
  1362. psubw xmm4, [edi+80] /* -=x5 */
  1363. psraw xmm4, 4 /* >>=4 */
  1364. psubw xmm4, xmm5 /* -=x8 */
  1365. pand xmm4, xmm7 /* and flag */
  1366. paddw xmm4, xmm5 /* += x8 */
  1367. packuswb xmm4, xmm0 /* pack to bytes */
  1368. movq QWORD PTR [esi+ecx], xmm4 /* write new x8 */
  1369. pop edi
  1370. pop esi
  1371. pop edx
  1372. pop ecx
  1373. pop eax
  1374. } /* end of the macro */
  1375. Var1=Variance1[0]+Variance1[1]+Variance1[2]+Variance1[3]+Variance1[4]+Variance1[5]+Variance1[6]+Variance1[7];
  1376. Var2=Variance2[0]+Variance2[1]+Variance2[2]+Variance2[3]+Variance2[4]+Variance2[5]+Variance2[6]+Variance2[7];
  1377. pbi->FragmentVariances[CurrentFrag] += Var1;
  1378. pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
  1379. if(CurrentFrag==StartFrag)
  1380. CurrentFrag++;
  1381. else
  1382. {
  1383. Des=DesPtr-8*PlaneLineStep+8*(CurrentFrag-StartFrag);
  1384. Src=Des;
  1385. QStep = QuantScale[pbi->FragQIndex[CurrentFrag]];
  1386. QStepWMT[0] = (INT16)QStep;
  1387. QStepWMT[1] = (INT16)QStep;
  1388. QStepWMT[2] = (INT16)QStep;
  1389. QStepWMT[3] = (INT16)QStep;
  1390. QStepWMT[4] = (INT16)QStep;
  1391. QStepWMT[5] = (INT16)QStep;
  1392. QStepWMT[6] = (INT16)QStep;
  1393. QStepWMT[7] = (INT16)QStep;
  1394. for( j=0; j<8;j++)
  1395. {
  1396. Rows[j] = (short) (Src[-5 +j*PlaneLineStep]);
  1397. Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
  1398. }
  1399. __asm
  1400. {
  1401. /* Save the registers */
  1402. push eax
  1403. push ecx
  1404. push edx
  1405. push esi
  1406. push edi
  1407. /* Calculate the FLimit and store FLimit and QStep */
  1408. movdqa xmm0, QStepWMT /* Get QStep */
  1409. movdqa xmm1, EightThrees /* mm1 = 03030303 */
  1410. pmullw xmm1, xmm0 /* mm1 = QStep * 3 */
  1411. pmullw xmm1, xmm0 /* mm1 = QStep * QStep * 3 */
  1412. psrlw xmm1, 5 /* mm1 = FLimit */
  1413. movdqa [FLimitWMT], xmm1 /* Save FLimit */
  1414. /* setup the pointers to data */
  1415. mov eax, Src /* eax = Src */
  1416. xor edx, edx /* clear edx */
  1417. mov esi, Des /* esi = Des */
  1418. sub eax, 4 /* eax = Src-4 */
  1419. sub esi, 4 /* esi = Des-4 */
  1420. lea edi, Rows /* edi = Rows */
  1421. mov ecx, PlaneLineStep /* ecx = Pitch */
  1422. sub edx, ecx /* edx = -Pitch */
  1423. lea esi, [esi+ecx*2] /* esi = Des-4 + 2 * Pitch */
  1424. /* Get the data to the intermediate buffer */
  1425. movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
  1426. movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
  1427. movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
  1428. lea eax, [eax+ecx*4] /* Go down four Rows */
  1429. movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
  1430. movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
  1431. punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
  1432. punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
  1433. movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
  1434. punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
  1435. punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
  1436. movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
  1437. punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
  1438. punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
  1439. movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
  1440. punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
  1441. punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
  1442. pxor mm7, mm7 /* clear mm7 */
  1443. movq mm5, mm0 /* make a copy */
  1444. punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
  1445. movq [edi+16], mm0 /* write 00 10 20 30 */
  1446. punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
  1447. movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
  1448. movq [edi+32], mm5 /* write 01 11 21 31 */
  1449. punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
  1450. punpckhbw mm0, mm7 /* mm0 = 33 23 12 03 */
  1451. movq [edi+48], mm1 /* write 02 12 22 32 */
  1452. movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
  1453. movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
  1454. movq [edi+64], mm0 /* write 03 13 23 33 */
  1455. punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
  1456. punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
  1457. movq [edi+80], mm2 /* write 04 14 24 34 */
  1458. punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
  1459. punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
  1460. movq [edi+96], mm3 /* write 05 15 25 35 */
  1461. movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
  1462. movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
  1463. movq [edi+112], mm4 /* write 06 16 26 37 */
  1464. movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
  1465. lea eax, [eax+ ecx*4] /* Go down four rows */
  1466. movq [edi+128], mm5 /* write 07 17 27 37 */
  1467. movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
  1468. movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
  1469. punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
  1470. punpckhbw mm4, mm1 /* mm4 = 57 57 56 46 55 45 54 44 */
  1471. movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
  1472. punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
  1473. punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
  1474. movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
  1475. punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
  1476. punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
  1477. movq mm2, mm4 /* mm2 = 57 57 56 46 55 45 54 44 */
  1478. punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
  1479. punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
  1480. movq mm5, mm0 /* make a copy */
  1481. punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
  1482. movq [edi+24], mm0 /* write 40 50 60 70 */
  1483. punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
  1484. movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
  1485. movq [edi+40], mm5 /* write 41 51 61 71 */
  1486. punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
  1487. punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
  1488. movq [edi+56], mm1 /* write 42 52 62 72 */
  1489. movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
  1490. movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
  1491. movq [edi+72], mm0 /* write 43 53 63 73 */
  1492. punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
  1493. punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
  1494. movq [edi+88], mm2 /* write 44 54 64 74 */
  1495. punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
  1496. punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
  1497. movq [edi+104], mm3 /* write 45 55 65 75 */
  1498. movq [edi+120], mm4 /* write 46 56 66 76 */
  1499. movq [edi+136], mm5 /* write 47 57 67 77 */
  1500. /* we use xmm0,xmm1,xmm2 for 1234 and xmm4, xmm5, xmm6 for 5-8 */
  1501. /* xmm7 = 0, xmm3 = {128, 128, 128, 128, 128, 128, 128, 128} */
  1502. pcmpeqw xmm3, xmm3 /* xmm3 = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF */
  1503. psllw xmm3, 15 /* xmm3 = 80008000800080008000800080008000 */
  1504. psrlw xmm3, 8 /* xmm3 = 00800080008000800080008000800080 */
  1505. movdqa xmm2, [edi+16] /* Pixel 1 */
  1506. movdqa xmm6, [edi+80] /* Pixel 5 */
  1507. psubw xmm2, xmm3 /* xmm2 -=128 */
  1508. psubw xmm6, xmm3 /* xmm6 -=128 */
  1509. movdqa xmm0, xmm2 /* xmm0 = pixel 1 */
  1510. movdqa xmm4, xmm6 /* xmm4 = pixel 5 */
  1511. pmullw xmm2, xmm2 /* xmm2 = pixel1 * pixel1 */
  1512. pmullw xmm6, xmm6 /* xmm6 = pixel5 * pixel5 */
  1513. movdqa xmm1, xmm2 /* xmm1 = pixel1^2 */
  1514. movdqa xmm5, xmm6 /* xmm5 = pixel5^2 */
  1515. movdqa xmm2, [edi+32] /* Pixel 2 */
  1516. movdqa xmm6, [edi+96] /* Pixel 6 */
  1517. psubw xmm2, xmm3 /* xmm2 -=128 */
  1518. psubw xmm6, xmm3 /* xmm6 -=128 */
  1519. paddw xmm0, xmm2 /* xmm0 += pixel 2 */
  1520. paddw xmm4, xmm6 /* xmm4 += pixel 6 */
  1521. pmullw xmm2, xmm2 /* xmm2 = pixel2^2 */
  1522. pmullw xmm6, xmm6 /* xmm6 = pixel6^2 */
  1523. paddw xmm1, xmm2 /* xmm1 += pixel2^2 */
  1524. paddw xmm5, xmm6 /* xmm5 += pixel6^2 */
  1525. movdqa xmm2, [edi+48] /* Pixel 3 */
  1526. movdqa xmm6, [edi+112] /* Pixel 7 */
  1527. psubw xmm2, xmm3 /* xmm2 -=128 */
  1528. psubw xmm6, xmm3 /* xmm6 -=128 */
  1529. paddw xmm0, xmm2 /* xmm0 += pixel 3 */
  1530. paddw xmm4, xmm6 /* xmm4 += pixel 7 */
  1531. pmullw xmm2, xmm2 /* xmm2 = pixel3^2 */
  1532. pmullw xmm6, xmm6 /* xmm6 = pixel7^2 */
  1533. paddw xmm1, xmm2 /* xmm1 += pixel3^2 */
  1534. paddw xmm5, xmm6 /* xmm5 += pixel7^2 */
  1535. movdqa xmm2, [edi+64] /* Pixel 4 */
  1536. movdqa xmm6, [edi+128] /* Pixel 8 */
  1537. psubw xmm2, xmm3 /* xmm2 -=128 */
  1538. psubw xmm6, xmm3 /* xmm6 -=128 */
  1539. paddw xmm0, xmm2 /* xmm0 += pixel 4 */
  1540. paddw xmm4, xmm6 /* xmm4 += pixel 8 */
  1541. pmullw xmm2, xmm2 /* xmm2 = pixel4^2 */
  1542. pmullw xmm6, xmm6 /* xmm6 = pixel8^2 */
  1543. paddw xmm1, xmm2 /* xmm1 = pixel4^2 */
  1544. paddw xmm5, xmm6 /* xmm5 = pixel8^2 */
  1545. /* xmm0 = x1^2 + x2^2 + x3^2 + x4^2 */
  1546. /* xmm1 = x1 + x2 + x3 + x4 */
  1547. /* xmm4 = x5^2 + x6^2 + x7^2 + x8^2 */
  1548. /* xmm5 = x5 + x6 + x7 + x8 */
  1549. movdqa xmm7, xmm3 /* xmm7 = xmm3 */
  1550. psrlw xmm7, 7 /* xmm7 = 00010001000100010001000100010001 */
  1551. movdqa xmm2, xmm0 /* make copy of sum1 */
  1552. movdqa xmm6, xmm4 /* make copy of sum2 */
  1553. paddw xmm0, xmm7 /* (sum1 + 1) */
  1554. paddw xmm4, xmm7 /* (sum2 + 1) */
  1555. psraw xmm2, 1 /* sum1 /2 */
  1556. psraw xmm6, 1 /* sum2 /2 */
  1557. psraw xmm0, 1 /* (sum1 + 1)/2 */
  1558. psraw xmm4, 1 /* (sum2 + 1)/2 */
  1559. pmullw xmm2, xmm0 /* (sum1)/2*(sum1+1)/2 */
  1560. pmullw xmm6, xmm4 /* (sum2)/2*(sum2+1)/2 */
  1561. psubw xmm1, xmm2 /* Variance 1 */
  1562. psubw xmm5, xmm6 /* Variance 2 */
  1563. movdqa xmm7, FLimitWMT /* xmm7 = FLimit */
  1564. movdqa xmm2, xmm1 /* copy of Varinace 1*/
  1565. movdqa [Variance1], xmm1 /* save the varinace1 */
  1566. movdqa [Variance2], xmm5 /* save the varinace2 */
  1567. movdqa xmm6, xmm5 /* Variance 2 */
  1568. psubw xmm1, xmm7 /* Variance 1 < Flimit? */
  1569. psubw xmm5, xmm7 /* Variance 2 < Flimit? */
  1570. psraw xmm2, 15 /* Variance 1 > 32768? */
  1571. psraw xmm6, 15 /* Vaiance 2 > 32768? */
  1572. psraw xmm1, 15 /* FFFF/0000 for true/false */
  1573. psraw xmm5, 15 /* FFFF/0000 for true/false */
  1574. movdqa xmm7, [edi+64] /* xmm0 = Pixel 4 */
  1575. pandn xmm2, xmm1 /* Variance1<32678 &&
  1576. Variance1<Limit */
  1577. pandn xmm6, xmm5 /* Variance2<32678 &&
  1578. Variance1<Limit */
  1579. movdqa xmm4, [edi+80] /* xmm4 = Pixel 5 */
  1580. pand xmm6, xmm2 /* xmm6 = Variance1 < Flimit */
  1581. /* &&Variance2 < Flimit */
  1582. movdqa xmm2, xmm7 /* make copy of Pixel4 */
  1583. psubusw xmm7, xmm4 /* 4 - 5 */
  1584. psubusw xmm4, xmm2 /* 5 - 4 */
  1585. por xmm7, xmm4 /* abs(4 - 5) */
  1586. psubw xmm7, QStepWMT /* abs(4-5)<QStepxmmx ? */
  1587. psraw xmm7, 15 /* FFFF/0000 for True/Flase */
  1588. pand xmm7, xmm6
  1589. /* xmm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
  1590. /* xmm7 now are in use */
  1591. /* find the loop filtered values for the pixels on block boundary */
  1592. movdqa xmm1, LoopFLimitWMT; /* Get the Flimit values for loop filter */
  1593. movdqa xmm3, [edi + 48] /* xmm3 = x3 = p[-2] */
  1594. movdqa xmm4, [edi + 64] /* mm4 = x4 = p[-1] */
  1595. movdqa xmm5, [edi + 80] /* mm5 = x5 = p[ 0] */
  1596. movdqa xmm6, [edi + 96] /* mm6 = x6 = p[ 1] */
  1597. psubw xmm5, xmm4 /* mm5 = p[ 0] - p[-1] */
  1598. psubw xmm3, xmm6 /* mm3 = p[-2] - p[ 1] */
  1599. movdqa xmm4, xmm5 /* make a copy */
  1600. paddw xmm4, xmm5 /* 2 * ( p[0] - p[-1] ) */
  1601. paddw xmm3, EightFours /* mm3 + 4 */
  1602. paddw xmm5, xmm4 /* 3 * ( p[0] - p[-1] ) */
  1603. paddw xmm3, xmm5 /* Filtval before shift */
  1604. psraw xmm3, 3 /* FiltVal */
  1605. movdqa xmm2, xmm3 /* make a copy */
  1606. psraw xmm3, 15 /* FFFF->Neg, 0000->Pos */
  1607. pxor xmm2, xmm3
  1608. psubsw xmm2, xmm3 /* mm2 = abs(FiltVal) */
  1609. por xmm3, EightOnes /* -1 and 1 for + and - */
  1610. movdqa xmm4, xmm1 /* make a copy of Flimit */
  1611. psubw xmm1, xmm2 /* mm1= Flimit - abs(FiltVal) */
  1612. movdqa xmm5, xmm1 /* copy Flimit - abs(FiltVal) */
  1613. psraw xmm1, 15 /* FFFF or 0000 */
  1614. pxor xmm5, xmm1
  1615. psubsw xmm5, xmm1 /* abs(Flimit - abs(FiltVal)) */
  1616. psubusw xmm4, xmm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
  1617. pmullw xmm4, xmm3 /* get the sign back */
  1618. movdqa xmm1, [edi+64] /* p[-1] */
  1619. movdqa xmm2, [edi+80] /* p[0] */
  1620. paddw xmm1, mm4 /* p[-1] + NewFiltVal */
  1621. psubw xmm2, mm4 /* p[0] - NewFiltVal */
  1622. pxor xmm6, xmm6 /* clear mm6 */
  1623. packuswb xmm1, xmm1 /* clamping */
  1624. packuswb xmm2, xmm2
  1625. punpcklbw xmm1, xmm6 /* unpack to word */
  1626. movdqa LoopFilteredValuesUp, xmm1 /* save the values */
  1627. punpcklbw xmm2, xmm6 /* unpack to word */
  1628. movdqa LoopFilteredValuesDown, xmm2 /* save the values */
  1629. /* Let's do the filtering now */
  1630. /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
  1631. /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
  1632. movdqa xmm5, [edi] /* xmm5 = -5 */
  1633. movdqa xmm4, [edi + 16] /* xmm4 = -4 */
  1634. movdqa xmm3, xmm4 /* copy of -4 */
  1635. movdqa xmm6, xmm5 /* copy of -5 */
  1636. psubusw xmm4, xmm6 /* xmm4 = [-4] - [-5] */
  1637. psubusw xmm5, xmm3 /* xmm5 = [-5] - [-4] */
  1638. por xmm4, xmm5 /* abs([-4]-[-5] ) */
  1639. psubw xmm4, QStepWMT /* abs([-4]-[-5] )<QStep? */
  1640. psraw xmm4, 15 /* FFFF/0000 for True/False */
  1641. movdqa xmm1, xmm4 /* copy of the xmm4 */
  1642. pand xmm4, xmm6 /* */
  1643. pandn xmm1, xmm3 /* */
  1644. por xmm1, xmm4 /* xmm1 = p1 */
  1645. /* now find P2 */
  1646. movdqa xmm4, [edi+128] /* xmm4 = [3] */
  1647. movdqa xmm5, [edi+144] /* xmm5 = [4] */
  1648. movdqa xmm3, xmm4 /* copy of 3 */
  1649. movdqa xmm6, xmm5 /* copy of 4 */
  1650. psubusw xmm4, xmm6 /* xmm4 = [3] - [4] */
  1651. psubusw xmm5, xmm3 /* xmm5 = [4] - [3] */
  1652. por xmm4, xmm5 /* abs([3]-[4] ) */
  1653. psubw xmm4, QStepWMT /* abs([3]-[4] )<QStep? */
  1654. psraw xmm4, 15 /* FFFF/0000 for True/False */
  1655. movdqa xmm2, xmm4 /* copy of the xmm4 */
  1656. pand xmm4, xmm6 /* */
  1657. pandn xmm2, xmm3 /* */
  1658. por xmm2, xmm4 /* xmm2 = p2 */
  1659. /* Data is ready, now do the filtering */
  1660. pxor xmm0, xmm0 /* clear xmm0 */
  1661. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  1662. /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
  1663. /* Des[-w4] = Src[-w4]; */
  1664. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
  1665. movdqa xmm3, xmm1 /* xmm3 = p1 */
  1666. paddw xmm3, xmm3 /* xmm3 = p1 + p1 */
  1667. paddw xmm3, xmm1 /* xmm3 = p1 + p1 + p1 */
  1668. movdqa xmm4, [edi+16] /* xmm4 = x1 */
  1669. paddw xmm3, [edi+32] /* xmm3 = p1+p1+p1+ x2 */
  1670. paddw xmm4, [edi+48] /* xmm4 = x1+x3 */
  1671. paddw xmm3, [edi+64] /* xmm3 += x4 */
  1672. paddw xmm4, EightFours /* xmm4 = x1 + x3 + 4 */
  1673. paddw xmm3, xmm4 /* xmm3 = 3*p1+x1+x2+x3+x4+4 */
  1674. movdqa xmm4, xmm3 /* xmm4 = xmm3 */
  1675. movdqa xmm5, [edi+16] /* xmm5 = x1 */
  1676. paddw xmm4, xmm5 /* xmm4 = sum+x1 */
  1677. psllw xmm4, 1 /* xmm4 = (sum+x1)<<1 */
  1678. psubw xmm4, [edi+64] /* xmm4 = (sum+x1)<<1-x4 */
  1679. paddw xmm4, [edi+80] /* xmm4 = (sum+x1)<<1-x4+x5 */
  1680. psraw xmm4, 4 /* xmm4 >>=4 */
  1681. psubw xmm4, xmm5 /* New Value - old Value */
  1682. pand xmm4, xmm7 /* And the flag */
  1683. paddw xmm4, xmm5 /* add the old value back */
  1684. packuswb xmm4, xmm0 /* pack it to bytes */
  1685. movdq2q mm0, xmm4 /* Write new x1 */
  1686. /* sum += x5 -p1 */
  1687. /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
  1688. movdqa xmm5, [edi+32] /* xmm5= x2 */
  1689. psubw xmm3, xmm1 /* sum=sum-p1 */
  1690. paddw xmm3, [edi+80] /* sum=sum+x5 */
  1691. movdqa xmm4, xmm5 /* copy sum */
  1692. paddw xmm4, xmm3 /* xmm4=sum+x2 */
  1693. paddw xmm4, xmm4 /* xmm4 <<= 1 */
  1694. psubw xmm4, [edi+80] /* xmm4 =(sum+x2)<<1-x5 */
  1695. paddw xmm4, [edi+96] /* xmm4 =(sum+x2)<<1-x5+x6 */
  1696. psraw xmm4, 4 /* xmm4=((sum+x2)<<1-x5+x6)>>4 */
  1697. psubw xmm4, xmm5 /* new value - old value */
  1698. pand xmm4, xmm7 /* And the flag */
  1699. paddw xmm4, xmm5 /* add the old value back */
  1700. packuswb xmm4, xmm0 /* pack it to bytes */
  1701. movdq2q mm1, xmm4 /* write new x2 */
  1702. /* sum += x6 - p1 */
  1703. /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
  1704. movdqa xmm5, [edi+48] /* xmm5= x3 */
  1705. psubw xmm3, xmm1 /* sum=sum-p1 */
  1706. paddw xmm3, [edi+96] /* sum=sum+x6 */
  1707. movdqa xmm4, xmm5 /* copy x3 */
  1708. paddw xmm4, xmm3 /* xmm4=sum+x3 */
  1709. paddw xmm4, xmm4 /* xmm4 <<= 1 */
  1710. psubw xmm4, [edi+96] /* xmm4 =(sum+x3)<<1-x6 */
  1711. paddw xmm4, [edi+112] /* xmm4 =(sum+x3)<<1-x6+x7 */
  1712. psraw xmm4, 4 /* xmm4=((sum+x3)<<1-x6+x7)>>4 */
  1713. psubw xmm4, xmm5 /* new value - old value */
  1714. pand xmm4, xmm7 /* And the flag */
  1715. paddw xmm4, xmm5 /* add the old value back */
  1716. packuswb xmm4, xmm0 /* pack it to bytes */
  1717. movdq2q mm2, xmm4 /* write new x3 */
  1718. /* sum += x7 - p1 */
  1719. /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
  1720. movdqa xmm5, [edi+64] /* xmm5 = x4 */
  1721. psubw xmm3, xmm1 /* sum = sum-p1 */
  1722. paddw xmm3, [edi+112] /* sum = sum+x7 */
  1723. movdqa xmm4, xmm5 /* xmm4 = x4 */
  1724. paddw xmm4, xmm3 /* xmm4 = sum + x4 */
  1725. paddw xmm4, xmm4 /* xmm4 *=2 */
  1726. paddw xmm4, xmm1 /* += p1 */
  1727. psubw xmm4, [edi+16] /* -= x1 */
  1728. psubw xmm4, [edi+112] /* -= x7 */
  1729. paddw xmm4, [edi+128] /* += x8 */
  1730. movdqa xmm5, LoopFilteredValuesUp /* Read the loop filtered value of x4 */
  1731. psraw xmm4, 4 /* >>=4 */
  1732. psubw xmm4, xmm5 /* -=x4 */
  1733. pand xmm4, xmm7 /* and flag */
  1734. paddw xmm4, xmm5 /* += x4 */
  1735. packuswb xmm4, xmm0 /* pack it to bytes */
  1736. movdq2q mm3, xmm4 /* write new x4 */
  1737. /* sum+= x8-x1 */
  1738. /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
  1739. movdqa xmm5, [edi+80] /* xmm5 = x5 */
  1740. psubw xmm3, [edi+16] /* sum -= x1 */
  1741. paddw xmm3, [edi+128] /* sub += x8 */
  1742. movdqa xmm4, xmm5 /* xmm4 = x5 */
  1743. paddw xmm4, xmm3 /* xmm4= sum+x5 */
  1744. paddw xmm4, xmm4 /* xmm4 *= 2 */
  1745. paddw xmm4, [edi+16] /* += x1 */
  1746. psubw xmm4, [edi+32] /* -= x2 */
  1747. psubw xmm4, [edi+128] /* -= x8 */
  1748. paddw xmm4, xmm2 /* += p2 */
  1749. movdqa xmm5, LoopFilteredValuesDown /* Read the loop filtered value of x4 */
  1750. psraw xmm4, 4 /* >>=4 */
  1751. psubw xmm4, xmm5 /* -=x5 */
  1752. pand xmm4, xmm7 /* and flag */
  1753. paddw xmm4, xmm5 /* += x5 */
  1754. packuswb xmm4, xmm0 /* pack to bytes */
  1755. movdq2q mm4, xmm4 /* write new x5 */
  1756. /* sum += p2 - x2 */
  1757. /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
  1758. movdqa xmm5, [edi+96] /* xmm5 = x6 */
  1759. psubw xmm3, [edi+32] /* -= x2 */
  1760. paddw xmm3, xmm2 /* += p2 */
  1761. movdqa xmm4, xmm5 /* xmm4 = x6 */
  1762. paddw xmm4, xmm3 /* xmm4 = sum+x6 */
  1763. paddw xmm4, xmm4 /* xmm4 *= 2*/
  1764. paddw xmm4, [edi+32] /* +=x2 */
  1765. psubw xmm4, [edi+48] /* -=x3 */
  1766. psraw xmm4, 4 /* >>=4 */
  1767. psubw xmm4, xmm5 /* -=x6 */
  1768. pand xmm4, xmm7 /* and flag */
  1769. paddw xmm4, xmm5 /* += x6 */
  1770. packuswb xmm4, xmm0 /* pack to bytes */
  1771. movdq2q mm5, xmm4 /* write new x6 */
  1772. /* sum += p2 - x3 */
  1773. /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
  1774. movdqa xmm5, [edi+112] /* xmm5 = x7 */
  1775. psubw xmm3, [edi+48] /* -= x3 */
  1776. paddw xmm3, xmm2 /* += p2 */
  1777. movdqa xmm4, xmm5 /* xmm4 = x7 */
  1778. paddw xmm4, xmm3 /* xmm4 = sum+x7 */
  1779. paddw xmm4, xmm4 /* xmm4 *= 2*/
  1780. paddw xmm4, [edi+48] /* +=x3 */
  1781. psubw xmm4, [edi+64] /* -=x4 */
  1782. psraw xmm4, 4 /* >>=4 */
  1783. psubw xmm4, xmm5 /* -=x7 */
  1784. pand xmm4, xmm7 /* and flag */
  1785. paddw xmm4, xmm5 /* += x7 */
  1786. packuswb xmm4, xmm0 /* pack to bytes */
  1787. movdq2q mm6, xmm4 /* write new x7 */
  1788. /* sum += p2 - x4 */
  1789. /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
  1790. movdqa xmm5, [edi+128] /* xmm5 = x8 */
  1791. psubw xmm3, [edi+64] /* -= x4 */
  1792. paddw xmm3, xmm2 /* += p2 */
  1793. movdqa xmm4, xmm5 /* xmm4 = x8 */
  1794. paddw xmm4, xmm3 /* xmm4 = sum+x8 */
  1795. paddw xmm4, xmm4 /* xmm4 *= 2*/
  1796. paddw xmm4, [edi+64] /* +=x4 */
  1797. psubw xmm4, [edi+80] /* -=x5 */
  1798. psraw xmm4, 4 /* >>=4 */
  1799. psubw xmm4, xmm5 /* -=x8 */
  1800. pand xmm4, xmm7 /* and flag */
  1801. paddw xmm4, xmm5 /* += x8 */
  1802. packuswb xmm4, xmm0 /* pack to bytes */
  1803. movdq2q mm7, xmm4 /* write new x8 */
  1804. /* transpose */
  1805. movq2dq xmm0, mm0 /* xmm0 = 70 60 50 40 30 20 10 00 */
  1806. movq2dq xmm1, mm1 /* xmm1 = 71 61 51 41 31 21 11 01 */
  1807. movq2dq xmm2, mm2 /* xmm2 = 72 62 52 42 32 22 12 02 */
  1808. movq2dq xmm3, mm3 /* xmm3 = 73 63 53 43 33 23 13 03 */
  1809. punpcklbw xmm0, xmm1 /* xmm0 = 7170 6160 5150 4140 3130 2120 1110 0100 */
  1810. punpcklbw xmm2, xmm3 /* xmm2 = 7372 6362 5352 4342 3332 2322 1312 0302 */
  1811. movdqa xmm1, xmm0 /* xmm1 = 7170 6160 5150 4140 3130 2120 1110 0100 */
  1812. punpcklwd xmm0, xmm2 /* xmm0 = 33323130 23222120 13121110 03020100 */
  1813. punpckhwd xmm1, xmm2 /* xmm1 = 73727170 63626160 53525150 43424140 */
  1814. movq2dq xmm4, mm4 /* xmm4 = 74 64 54 44 34 24 14 04 */
  1815. movq2dq xmm5, mm5 /* xmm5 = 75 65 55 45 35 25 15 05 */
  1816. movq2dq xmm6, mm6 /* xmm6 = 76 66 56 46 36 26 16 06 */
  1817. movq2dq xmm7, mm7 /* xmm7 = 77 67 57 47 37 27 17 07 */
  1818. punpcklbw xmm4, xmm5 /* xmm4 = 7574 6564 5554 4544 3534 2524 1514 0504 */
  1819. punpcklbw xmm6, xmm7 /* xmm6 = 7776 6766 5756 4746 3736 2726 1716 0706 */
  1820. movdqa xmm5, xmm4 /* xmm5 = 7574 6564 5554 4544 3534 2524 1514 0504 */
  1821. punpcklwd xmm4, xmm6 /* xmm4 = 37363534 27262524 17161514 07060504 */
  1822. punpckhwd xmm5, xmm6 /* xmm5 = 77767574 67666564 57565554 47464544 */
  1823. movdqa xmm2, xmm0 /* xmm2 = 33323130 23222120 13121110 03020100 */
  1824. punpckldq xmm0, xmm4 /* xmm0 = 1716151413121110 0706050403020100 */
  1825. movq QWORD PTR [esi+edx*2],xmm0 /* write 00 01 02 03 04 05 06 07 */
  1826. psrldq xmm0, 8 /* xmm0 = 1716151413121110 */
  1827. punpckhdq xmm2, xmm4 /* xmm2 = 3736353433323130 2726252423222120 */
  1828. movq QWORD PTR [esi+edx], xmm0 /* write 10 11 12 13 14 15 16 17 */
  1829. movdqa xmm3, xmm1 /* xmm3 = 73727170 63626160 53525150 43424140 */
  1830. punpckldq xmm1, xmm5 /* xmm1 = 5756555453525150 4746454443424140 */
  1831. movq QWORD PTR [esi], xmm2 /* write 20 21 22 23 24 25 26 27 */
  1832. psrldq xmm2, 8 /* xmm2 = 3736353433323130 */
  1833. punpckhdq xmm3, xmm5 /* xmm3 = 7776757473727170 6766656463626160 */
  1834. movq QWORD PTR [esi+ecx], xmm2 /* write 30 31 32 33 34 35 36 37 */
  1835. lea esi, [esi+ecx*4] /* esi= Des - 4 + 4 *pitch */
  1836. movq QWORD PTR [esi+edx*2], xmm1 /* write 40 41 42 43 44 45 46 47 */
  1837. movq QWORD PTR [esi], xmm3 /* write 60 61 62 63 64 65 66 67 */
  1838. psrldq xmm1, 8 /* xmm1 = 5756555453525150 */
  1839. psrldq xmm3, 8 /* xmm3 = 7776757473727170 */
  1840. movq QWORD PTR [esi+edx], xmm1 /* write 50 51 52 53 54 55 56 57 */
  1841. movq QWORD PTR [esi+ecx], xmm3 /* write 70 71 72 73 74 75 76 77 */
  1842. pop edi
  1843. pop esi
  1844. pop edx
  1845. pop ecx
  1846. pop eax
  1847. }// end of __asm
  1848. Var1=Variance1[0]+Variance1[1]+Variance1[2]+Variance1[3]+Variance1[4]+Variance1[5]+Variance1[6]+Variance1[7];
  1849. Var2=Variance2[0]+Variance2[1]+Variance2[2]+Variance2[3]+Variance2[4]+Variance2[5]+Variance2[6]+Variance2[7];
  1850. pbi->FragmentVariances[CurrentFrag-1] += Var1;
  1851. pbi->FragmentVariances[CurrentFrag] += Var2;
  1852. CurrentFrag ++;
  1853. }//else
  1854. }//while
  1855. #endif
  1856. }
  1857. /****************************************************************************
  1858. *
  1859. * ROUTINE : PlaneAddNoise_wmt
  1860. *
  1861. * INPUTS : UINT8 *Start starting address of buffer to add gaussian
  1862. * noise to
  1863. * UINT32 Width width of plane
  1864. * UINT32 Height height of plane
  1865. * INT32 Pitch distance between subsequent lines of frame
  1866. * INT32 q quantizer used to determine amount of noise
  1867. * to add
  1868. *
  1869. * OUTPUTS : None.
  1870. *
  1871. * RETURNS : void.
  1872. *
  1873. * FUNCTION : adds gaussian noise to a plane of pixels
  1874. *
  1875. * SPECIAL NOTES : None.
  1876. *
  1877. ****************************************************************************/
  1878. void PlaneAddNoise_wmt( UINT8 *Start, UINT32 Width, UINT32 Height, INT32 Pitch, int q)
  1879. {
  1880. unsigned int i;
  1881. INT32 Pitch4 = Pitch * 4;
  1882. const int noiseAmount = 2;
  1883. const int noiseAdder = 2 * noiseAmount + 1;
  1884. #if defined(_WIN32_WCE)
  1885. return;
  1886. #else
  1887. __declspec(align(16)) unsigned char blackclamp[16];
  1888. __declspec(align(16)) unsigned char whiteclamp[16];
  1889. __declspec(align(16)) unsigned char bothclamp[16];
  1890. char CharDist[300];
  1891. char Rand[2048];
  1892. double sigma;
  1893. // return;
  1894. __asm emms
  1895. sigma = 1 + .8*(63-q) / 63.0;
  1896. // set up a lookup table of 256 entries that matches
  1897. // a gaussian distribution with sigma determined by q.
  1898. //
  1899. {
  1900. double i,sum=0;
  1901. int next,j;
  1902. next=0;
  1903. for(i=-32;i<32;i++)
  1904. {
  1905. int a = (int)(.5+256*gaussian(sigma,0,i));
  1906. if(a)
  1907. {
  1908. for(j=0;j<a;j++)
  1909. {
  1910. CharDist[next+j]=(char) i;
  1911. }
  1912. next = next+j;
  1913. }
  1914. }
  1915. for(next=next;next<256;next++)
  1916. CharDist[next] = 0;
  1917. }
  1918. for(i=0;i<2048;i++)
  1919. {
  1920. Rand[i]=CharDist[rand() & 0xff];
  1921. }
  1922. for(i=0;i<16;i++)
  1923. {
  1924. blackclamp[i]=-CharDist[0];
  1925. whiteclamp[i]=-CharDist[0];
  1926. bothclamp[i]=-2*CharDist[0];
  1927. }
  1928. for(i=0;i<Height;i++)
  1929. {
  1930. UINT8 *Pos = Start + i *Pitch;
  1931. INT8 *Ref = Rand + (rand() & 0xff);
  1932. __asm
  1933. {
  1934. mov ecx, [Width]
  1935. mov esi,Pos
  1936. mov edi,Ref
  1937. xor eax,eax
  1938. nextset:
  1939. movdqu xmm1,[esi+eax] // get the source
  1940. psubusb xmm1,blackclamp // clamp both sides so we don't outrange adding noise
  1941. paddusb xmm1,bothclamp
  1942. psubusb xmm1,whiteclamp
  1943. movdqu xmm2,[edi+eax] // get the noise for this line
  1944. paddb xmm1,xmm2 // add it in
  1945. movdqu [esi+eax],xmm1 // store the result
  1946. add eax,16 // move to the next line
  1947. cmp eax, ecx
  1948. jl nextset
  1949. }
  1950. }
  1951. #endif
  1952. }