blending.h 13 KB


  1. #ifndef _BLENDING_H_
  2. #define _BLENDING_H_
  3. //#include <bfc/common.h>
  4. #include <bfc/platform/types.h>
  // Static helpers for blending 32-bit pixels that hold four 8-bit channels
  // packed into one unsigned int.  The plain variants work through the
  // alphatable lookup table; the *_MMX variants (compiled unless NO_MMX is
  // defined) use MMX inline assembly.  All inline members are defined after
  // the class, later in this header.
  class Blenders
  {
  public:
    // Defined outside this header.
    // NOTE(review): presumably fills alphatable and sets mmx_available -- confirm.
    static void init();

    // Table-driven blenders.
    static inline unsigned int BLEND_ADJ1(unsigned int a, unsigned int b, int alpha);
    static inline unsigned int BLEND_ADJ2(unsigned int a, unsigned int b);
    static inline unsigned int BLEND_ADJ3(unsigned int a, unsigned int b, int alpha);
    static inline unsigned int BLEND_MUL(unsigned int a, int v);
    static inline unsigned int BLEND_AVG(unsigned int a, unsigned int b);
    static inline unsigned int BLEND4(unsigned int *p1, unsigned int w, int xp, int yp);

  #ifndef NO_MMX
    // Returns the mmx_available flag.
    static inline int MMX_AVAILABLE() { return mmx_available; }

    // MMX counterparts of the blenders above.
    static inline unsigned int BLEND_ADJ1_MMX(unsigned int a, unsigned int b, int alpha);
    static inline unsigned int BLEND_ADJ2_MMX(unsigned int a, unsigned int b);
    static inline unsigned int BLEND_ADJ3_MMX(unsigned int a, unsigned int b, int alpha);
    static inline unsigned int BLEND_MUL_MMX(unsigned int a, int v);
    static inline unsigned int BLEND_AVG_MMX(unsigned int a, unsigned int b);
    static inline unsigned int BLEND4_MMX(unsigned int *p1, unsigned int w, int xp, int yp);

    // Executes EMMS when mmx_available is non-zero; intended to be called once
    // a run of *_MMX calls is finished.  The compiler split mirrors the one
    // used by the *_MMX bodies (MSVC syntax on WIN32, GCC syntax otherwise) so
    // that every build that compiles the MMX code also gets the EMMS.
    static inline void BLEND_MMX_END()
    {
  #ifdef WIN32
      if (mmx_available) __asm emms;
  #else
      if (mmx_available) __asm__ volatile ( "emms" : : );
  #endif
    }
  #endif

    // The members below are public; the original `private:` specifier is
    // commented out.
    //private:

    // 256x256 byte lookup table, indexed by the blenders as
    // alphatable[channel value][scalar].  Its contents are filled elsewhere.
    static uint8_t alphatable[256][256];
  #ifndef NO_MMX
    // Non-zero when the MMX code paths may be used; set elsewhere.
    static int mmx_available;
  #endif
  };
  39. // NON MMX
  40. // average blend of a and b.
  41. unsigned int inline Blenders::BLEND_AVG(unsigned int a, unsigned int b)
  42. {
  43. return ((a >> 1)&~((1 << 7) | (1 << 15) | (1 << 23))) + ((b >> 1)&~((1 << 7) | (1 << 15) | (1 << 23)));
  44. }
  45. // multiplies 32 bit color A by scalar V (0-255)
  46. unsigned int inline Blenders::BLEND_MUL(unsigned int a, int v)
  47. {
  48. register int t;
  49. t = Blenders::alphatable[a & 0xFF][v];
  50. t |= Blenders::alphatable[(a & 0xFF00) >> 8][v] << 8;
  51. t |= Blenders::alphatable[(a & 0xFF0000) >> 16][v] << 16;
  52. t |= Blenders::alphatable[(a & 0xFF000000) >> 24][v] << 24;
  53. return t;
  54. }
  55. // V is scalar (0-255), (1.0-V)*b + V*a
  56. unsigned int inline Blenders::BLEND_ADJ1(unsigned int a, unsigned int b, int v)
  57. {
  58. register int t;
  59. t = Blenders::alphatable[b & 0xFF][0xFF - v] + Blenders::alphatable[a & 0xFF][v];
  60. t |= (Blenders::alphatable[(b & 0xFF00) >> 8][0xFF - v] + Blenders::alphatable[(a & 0xFF00) >> 8][v]) << 8;
  61. t |= (Blenders::alphatable[(b & 0xFF0000) >> 16][0xFF - v] + Blenders::alphatable[(a & 0xFF0000) >> 16][v]) << 16;
  62. t |= (Blenders::alphatable[(b & 0xFF000000) >> 24][0xFF - v] + Blenders::alphatable[(a & 0xFF000000) >> 24][v]) << 24;
  63. return t;
  64. }
  65. // returns a*(1.0-Alpha(b)) + b
  66. unsigned int inline Blenders::BLEND_ADJ2(unsigned int a, unsigned int b)
  67. {
  68. register int t, z;
  69. int v = 0xff - ((b >> 24) & 0xff);
  70. t = Blenders::alphatable[a & 0xFF][v] + (b & 0xFF);
  71. if (t > 0xFF) t = 0xff;
  72. z = (Blenders::alphatable[(a & 0xFF00) >> 8][v] << 8) + (b & 0xFF00);
  73. if (z > 0xFF00) z = 0xff00;
  74. t |= z;
  75. z = (Blenders::alphatable[(a & 0xFF0000) >> 16][v] << 16) + ((b & 0xFF0000));
  76. if (z > 0xFF0000) z = 0xff0000;
  77. t |= z;
  78. z = (Blenders::alphatable[(a & 0xFF000000) >> 24][v]) + ((b & 0xFF000000) >> 24);
  79. if (z > 0xFF) z = 0xff;
  80. return t | (z << 24);
  81. }
  82. // returns a*(1-Alpha(b)*W) + b*W, clamped (W is scalar 0-0xff).
  83. unsigned int inline Blenders::BLEND_ADJ3(unsigned int a, unsigned int b, int w)
  84. {
  85. register int t, z;
  86. int v = 0xff - Blenders::alphatable[(b >> 24) & 0xff][w];
  87. t = Blenders::alphatable[a & 0xFF][v] + Blenders::alphatable[b & 0xFF][w];
  88. if (t > 0xFF) t = 0xFF;
  89. z = Blenders::alphatable[(a & 0xFF00) >> 8][v] + Blenders::alphatable[(b & 0xFF00) >> 8][w];
  90. if (z > 0xFF) z = 0xFF;
  91. t |= z << 8;
  92. z = Blenders::alphatable[(a & 0xFF0000) >> 16][v] + Blenders::alphatable[(b & 0xFF0000) >> 16][w];
  93. if (z > 0xFF) z = 0xFF;
  94. t |= z << 16;
  95. z = Blenders::alphatable[(a & 0xFF000000) >> 24][v] + Blenders::alphatable[(b & 0xFF000000) >> 24][w];
  96. if (z > 0xFF) z = 0xFF;
  97. return t | (z << 24);
  98. }
  99. unsigned int __inline Blenders::BLEND4(unsigned int *p1, unsigned int w, int xp, int yp)
  100. {
  101. register int t;
  102. uint8_t a1, a2, a3, a4;
  103. xp = (xp >> 8) & 0xff;
  104. yp = (yp >> 8) & 0xff;
  105. a1 = alphatable[255 - xp][255 - yp];
  106. a2 = alphatable[xp][255 - yp];
  107. a3 = alphatable[255 - xp][yp];
  108. a4 = alphatable[xp][yp];
  109. t = alphatable[p1[0] & 0xff][a1] + alphatable[p1[1] & 0xff][a2] + alphatable[p1[w] & 0xff][a3] + alphatable[p1[w + 1] & 0xff][a4];
  110. t |= (alphatable[(p1[0] >> 8) & 0xff][a1] + alphatable[(p1[1] >> 8) & 0xff][a2] + alphatable[(p1[w] >> 8) & 0xff][a3] + alphatable[(p1[w + 1] >> 8) & 0xff][a4]) << 8;
  111. t |= (alphatable[(p1[0] >> 16) & 0xff][a1] + alphatable[(p1[1] >> 16) & 0xff][a2] + alphatable[(p1[w] >> 16) & 0xff][a3] + alphatable[(p1[w + 1] >> 16) & 0xff][a4]) << 16;
  112. t |= (alphatable[(p1[0] >> 24) & 0xff][a1] + alphatable[(p1[1] >> 24) & 0xff][a2] + alphatable[(p1[w] >> 24) & 0xff][a3] + alphatable[(p1[w + 1] >> 24) & 0xff][a4]) << 24;
  113. return t;
  114. }
  115. #ifndef NO_MMX
  // Compiler setup and 64-bit constants for the MMX blenders.
  // On WIN32 the warning state is pushed (and popped at the end of the MMX
  // section) with C4799 disabled, and the tables are declared const; on other
  // compilers they are left non-const.
  #ifdef WIN32
  #pragma warning( push, 1 )
  #pragma warning(disable: 4799)
  #define MMX_CONST const
  #else
  #define MMX_CONST
  #endif
  // Four 16-bit lanes of 0x0100 (256): the value the per-lane weights are
  // subtracted from.
  static unsigned int MMX_CONST Blenders__mmx_revn2[2] = {0x01000100, 0x01000100};
  // All-zero quadword used to widen bytes to words with punpcklbw.  The
  // explicit initializer is required because a const object without one is
  // ill-formed in C++.
  static unsigned int MMX_CONST Blenders__mmx_zero[2] = {0, 0};
  // The value 1 in the lowest lane only.
  static unsigned int MMX_CONST Blenders__mmx_one[2] = {1, 0};
  // NOTE(review): the GCC inline-assembly bodies refer to these arrays by
  // symbol name inside the asm text rather than through operands -- confirm
  // the compiler emits them under exactly these names and does not drop them
  // as unused.
  #undef MMX_CONST
  129. /// MMX
  130. // average blend of a and b.
  131. unsigned int inline Blenders::BLEND_AVG_MMX(unsigned int a, unsigned int b)
  132. {
  133. return ((a >> 1)&~((1 << 7) | (1 << 15) | (1 << 23))) + ((b >> 1)&~((1 << 7) | (1 << 15) | (1 << 23)));
  134. }
  135. // multiplies 32 bit color A by scalar V (0-255)
  136. unsigned int inline Blenders::BLEND_MUL_MMX(unsigned int a, int v)
  137. {
  138. #ifdef WIN32
  139. __asm
  140. {
  141. movd mm3, [v] // VVVVVVVV
  142. movd mm0, [a]
  143. packuswb mm3, mm3 // 0000HHVV
  144. punpcklbw mm0, [Blenders__mmx_zero]
  145. punpcklwd mm3, mm3 // HHVVHHVV
  146. punpckldq mm3, mm3 // HHVVHHVV HHVVHHVV
  147. pmullw mm0, mm3
  148. psrlw mm0, 8
  149. packuswb mm0, mm0
  150. movd eax, mm0
  151. }
  152. #else
  153. __asm__ volatile (
  154. "movd %0, %%mm3\n"
  155. "movd %1, %%mm0\n"
  156. "packuswb %%mm3, %%mm3\n"
  157. "punpcklbw (Blenders__mmx_zero), %%mm0\n"
  158. "punpcklwd %%mm3, %%mm3\n"
  159. "punpckldq %%mm3, %%mm3\n"
  160. "pmullw %%mm3, %%mm0\n"
  161. "psrlw $8, %%mm0\n"
  162. "packuswb %%mm0, %%mm0\n"
  163. "movd %%mm0, %%eax\n"
  164. :
  165. : "m" (v), "m" (a)
  166. : "%mm0", "%mm3" );
  167. #endif
  168. }
  169. // V is scalar (0-255), (1.0-V)*b + V*a
  170. unsigned int inline Blenders::BLEND_ADJ1_MMX(unsigned int a, unsigned int b, int v)
  171. {
  172. #ifdef WIN32
  173. __asm
  174. {
  175. movd mm3, [v] // VVVVVVVV
  176. movd mm0, [a]
  177. packuswb mm3, mm3 // 0000HHVV
  178. movd mm1, [b]
  179. paddusw mm3, [Blenders__mmx_one]
  180. movq mm4, [Blenders__mmx_revn2]
  181. punpcklwd mm3, mm3 // HHVVHHVV
  182. punpcklbw mm0, [Blenders__mmx_zero]
  183. punpckldq mm3, mm3 // HHVVHHVV HHVVHHVV
  184. punpcklbw mm1, [Blenders__mmx_zero]
  185. psubw mm4, mm3
  186. pmullw mm0, mm3
  187. pmullw mm1, mm4
  188. paddw mm0, mm1
  189. psrlw mm0, 8
  190. packuswb mm0, mm0
  191. movd eax, mm0
  192. }
  193. #else
  194. __asm__ volatile (
  195. "movd %0, %%mm3\n"
  196. "movd %1, %%mm0\n"
  197. "packuswb %%mm3, %%mm3\n"
  198. "movd %2, %%mm1\n"
  199. "paddusw (Blenders__mmx_one), %%mm3\n"
  200. "movq (Blenders__mmx_revn2), %%mm4\n"
  201. "punpcklwd %%mm3, %%mm3\n"
  202. "punpcklbw (Blenders__mmx_zero), %%mm0\n"
  203. "punpckldq %%mm3, %%mm3\n"
  204. "punpcklbw (Blenders__mmx_zero), %%mm1\n"
  205. "psubw %%mm3, %%mm4\n"
  206. "pmullw %%mm3, %%mm0\n"
  207. "pmullw %%mm4, %%mm1\n"
  208. "paddw %%mm1, %%mm0\n"
  209. "psrlw $8, %%mm0\n"
  210. "packuswb %%mm0, %%mm0\n"
  211. "movd %%mm0, %%eax\n"
  212. :
  213. : "m" (v), "m" (a), "m" (b)
  214. : "%mm0", "%mm1", "%mm3", "%mm4" );
  215. #endif
  216. }
  217. // returns a*(1.0-Alpha(b)) + b
  218. unsigned int inline Blenders::BLEND_ADJ2_MMX(unsigned int a, unsigned int b)
  219. {
  220. #ifdef WIN32
  221. __asm
  222. {
  223. movd mm3, [b] // VVVVVVVV
  224. movq mm4, [Blenders__mmx_revn2]
  225. movd mm0, [a]
  226. psrld mm3, 24
  227. movd mm1, [b]
  228. paddusw mm3, [Blenders__mmx_one]
  229. punpcklwd mm3, mm3 // HHVVHHVV
  230. punpcklbw mm0, [Blenders__mmx_zero]
  231. punpckldq mm3, mm3 // HHVVHHVV HHVVHHVV
  232. punpcklbw mm1, [Blenders__mmx_zero]
  233. psubw mm4, mm3
  234. pmullw mm0, mm4
  235. // stall
  236. // stall
  237. // stall
  238. psrlw mm0, 8
  239. // stall
  240. paddw mm0, mm1
  241. // stall
  242. packuswb mm0, mm0
  243. // stall
  244. movd eax, mm0
  245. }
  246. #else
  247. __asm__ volatile (
  248. "movd %1, %%mm3\n"
  249. "movq (Blenders__mmx_revn2), %%mm4\n"
  250. "movd %0, %%mm0\n"
  251. "psrld $24, %%mm3\n"
  252. "movd %1, %%mm1\n"
  253. "paddusw (Blenders__mmx_one), %%mm3\n"
  254. "punpcklwd %%mm3, %%mm3\n"
  255. "punpcklbw (Blenders__mmx_zero), %%mm0\n"
  256. "punpckldq %%mm3, %%mm3\n"
  257. "punpcklbw (Blenders__mmx_zero), %%mm1\n"
  258. "psubw %%mm3, %%mm4\n"
  259. "pmullw %%mm4, %%mm0\n"
  260. "psrlw $8, %%mm0\n"
  261. "paddw %%mm1, %%mm0\n"
  262. "packuswb %%mm0, %%mm0\n"
  263. "movd %%mm0, %%eax\n"
  264. :
  265. : "m" (a), "m" (b)
  266. : "%esi", "%mm0", "%mm1", "%mm3", "%mm4" );
  267. #endif
  268. }
  269. // returns a*(1-Alpha(b)*W) + b*W, clamped (W is scalar 0-0xff).
  270. unsigned int inline Blenders::BLEND_ADJ3_MMX(unsigned int a, unsigned int b, int w)
  271. {
  272. #ifdef WIN32
  273. __asm
  274. {
  275. movd mm3, [b] // VVVVVVVV
  276. movd mm5, [w]
  277. movd mm0, [a]
  278. psrld mm3, 24
  279. movd mm1, [b]
  280. paddusw mm3, [Blenders__mmx_one]
  281. movq mm4, [Blenders__mmx_revn2]
  282. pmullw mm3, mm5
  283. packuswb mm5, mm5
  284. punpcklbw mm0, [Blenders__mmx_zero]
  285. punpcklwd mm5, mm5
  286. punpcklbw mm1, [Blenders__mmx_zero]
  287. psrlw mm3, 8
  288. punpckldq mm5, mm5
  289. paddusw mm3, [Blenders__mmx_one]
  290. punpcklwd mm3, mm3 // HHVVHHVV
  291. punpckldq mm3, mm3 // HHVVHHVV HHVVHHVV
  292. psubw mm4, mm3
  293. pmullw mm0, mm4
  294. pmullw mm1, mm5
  295. paddusw mm0, mm1
  296. psrlw mm0, 8
  297. packuswb mm0, mm0
  298. movd eax, mm0
  299. }
  300. #else
  301. __asm__ volatile (
  302. "movd %2, %%mm3\n"
  303. "movd %0, %%mm5\n"
  304. "movd %1, %%mm0\n"
  305. "psrld $24, %%mm3\n"
  306. "movd %2, %%mm1\n"
  307. "paddusw (Blenders__mmx_one), %%mm3\n"
  308. "movq (Blenders__mmx_revn2), %%mm4\n"
  309. "pmullw %%mm5, %%mm3\n"
  310. "packuswb %%mm5, %%mm5 \n"
  311. "punpcklbw (Blenders__mmx_zero), %%mm0\n"
  312. "punpcklwd %%mm5, %%mm5\n"
  313. "punpcklbw (Blenders__mmx_zero), %%mm1\n"
  314. "psrlw $8, %%mm3\n"
  315. "punpckldq %%mm5, %%mm5\n"
  316. "paddusw (Blenders__mmx_one), %%mm3\n"
  317. "punpcklwd %%mm3, %%mm3\n"
  318. "punpckldq %%mm3, %%mm3\n"
  319. "psubw %%mm3, %%mm4\n"
  320. "pmullw %%mm4, %%mm0\n"
  321. "pmullw %%mm5, %%mm1\n"
  322. "paddusw %%mm1, %%mm0\n"
  323. "psrlw $8, %%mm0\n"
  324. "packuswb %%mm0, %%mm0\n"
  325. "movd %%mm0, %%eax\n"
  326. :
  327. : "m" (w), "m" (a), "m" (b)
  328. : "%mm0", "%mm1", "%mm4", "%mm3", "%mm5" );
  329. #endif
  330. }
  331. // does bilinear filtering. p1 is upper left pixel, w is width of framebuffer
  332. // xp and yp's low 16 bits are used for the subpixel positioning.
  333. unsigned int inline Blenders::BLEND4_MMX(unsigned int *p1, unsigned int w, int xp, int yp)
  334. {
  335. #ifdef WIN32
  336. __asm
  337. {
  338. movd mm6, xp
  339. mov eax, p1
  340. movd mm7, yp
  341. mov esi, w
  342. movq mm4, Blenders__mmx_revn2
  343. psrlw mm6, 8
  344. movq mm5, Blenders__mmx_revn2
  345. psrlw mm7, 8
  346. movd mm0, [eax]
  347. punpcklwd mm6, mm6
  348. movd mm1, [eax + 4]
  349. punpcklwd mm7, mm7
  350. movd mm2, [eax + esi*4]
  351. punpckldq mm6, mm6
  352. movd mm3, [eax + esi*4 + 4]
  353. punpckldq mm7, mm7
  354. punpcklbw mm0, [Blenders__mmx_zero]
  355. psubw mm4, mm6
  356. punpcklbw mm1, [Blenders__mmx_zero]
  357. pmullw mm0, mm4
  358. punpcklbw mm2, [Blenders__mmx_zero]
  359. pmullw mm1, mm6
  360. punpcklbw mm3, [Blenders__mmx_zero]
  361. psubw mm5, mm7
  362. pmullw mm2, mm4
  363. pmullw mm3, mm6
  364. paddw mm0, mm1
  365. // stall (mm0)
  366. psrlw mm0, 8
  367. // stall (waiting for mm3/mm2)
  368. paddw mm2, mm3
  369. pmullw mm0, mm5
  370. psrlw mm2, 8
  371. // stall (mm2)
  372. pmullw mm2, mm7
  373. // stall
  374. // stall (mm2)
  375. paddw mm0, mm2
  376. // stall
  377. psrlw mm0, 8
  378. // stall
  379. packuswb mm0, mm0
  380. // stall
  381. movd eax, mm0
  382. }
  383. #else
  384. __asm__ volatile (
  385. "movd %2, %%mm6\n"
  386. "mov %0, %%eax\n"
  387. "movd %3, %%mm7\n"
  388. "mov %1, %%esi\n"
  389. "movq (Blenders__mmx_revn2), %%mm4\n"
  390. "psrlw $8, %%mm6\n"
  391. "movq (Blenders__mmx_revn2), %%mm5\n"
  392. "psrlw $8, %%mm7\n"
  393. "movd (%%eax), %%mm0\n"
  394. "punpcklwd %%mm6,%%mm6\n"
  395. "movd 4(%%eax), %%mm1\n"
  396. "punpcklwd %%mm7,%%mm7\n"
  397. "movd (%%eax,%%esi,4), %%mm2\n"
  398. "punpckldq %%mm6,%%mm6\n"
  399. "movd 4(%%eax,%%esi,4), %%mm3\n"
  400. "punpckldq %%mm7,%%mm7\n"
  401. "punpcklbw (Blenders__mmx_zero), %%mm0\n"
  402. "psubw %%mm6, %%mm4\n"
  403. "punpcklbw (Blenders__mmx_zero), %%mm1\n"
  404. "pmullw %%mm4, %%mm0\n"
  405. "punpcklbw (Blenders__mmx_zero), %%mm2\n"
  406. "pmullw %%mm6, %%mm1\n"
  407. "punpcklbw (Blenders__mmx_zero), %%mm3\n"
  408. "psubw %%mm7, %%mm5\n"
  409. "pmullw %%mm4, %%mm2\n"
  410. "pmullw %%mm6, %%mm3\n"
  411. "paddw %%mm1, %%mm0\n"
  412. "psrlw $8, %%mm0\n"
  413. "paddw %%mm3, %%mm2\n"
  414. "pmullw %%mm5, %%mm0\n"
  415. "psrlw $8, %%mm2\n"
  416. "pmullw %%mm7, %%mm2\n"
  417. "paddw %%mm2, %%mm0\n"
  418. "psrlw $8, %%mm0\n"
  419. "packuswb %%mm0, %%mm0\n"
  420. "movd %%mm0, %%eax\n"
  421. :
  422. : "m" (p1), "m" (w), "m" (xp), "m" (yp)
  423. : "%mm0", "%mm1", "%mm4", "%mm3", "%mm5" );
  424. #endif
  425. }
  426. #ifdef WIN32
  427. #pragma warning( pop )
  428. #endif
  429. #endif // ndef NO_MMX
  430. #endif