// blending.h

#ifndef _BLENDING_H_
#define _BLENDING_H_

//#include <bfc/common.h>
#include <bfc/platform/types.h>
#include <tataki/export.h>

#ifdef _WIN64
#define NO_MMX
#endif

class TATAKIAPI Blenders
{
public:
  static void init();

  static unsigned int inline BLEND_ADJ1(unsigned int a, unsigned int b, int alpha);
  static unsigned int inline BLEND_ADJ2(unsigned int a, unsigned int b);
  static unsigned int inline BLEND_ADJ3(unsigned int a, unsigned int b, int alpha);
  static unsigned int inline BLEND_MUL(unsigned int a, int v);
  static unsigned int inline BLEND_AVG(unsigned int a, unsigned int b);
  static unsigned int inline BLEND4(unsigned int *p1, unsigned int w, int xp, int yp);

#ifndef NO_MMX
  static int inline MMX_AVAILABLE() { return mmx_available; }

  static unsigned int inline BLEND_ADJ1_MMX(unsigned int a, unsigned int b, int alpha);
  static unsigned int inline BLEND_ADJ2_MMX(unsigned int a, unsigned int b);
  static unsigned int inline BLEND_ADJ3_MMX(unsigned int a, unsigned int b, int alpha);
  static unsigned int inline BLEND_MUL_MMX(unsigned int a, int v);
  static unsigned int inline BLEND_AVG_MMX(unsigned int a, unsigned int b);
  static unsigned int inline BLEND4_MMX(unsigned int *p1, unsigned int w, int xp, int yp);

  static void inline BLEND_MMX_END()
  {
#if defined(WIN32) && !defined(_WIN64)
    if (mmx_available) __asm emms;
#endif
#ifdef LINUX
    if (mmx_available) __asm__ volatile ( "emms" : : );
#endif
  }
#endif

  //private:
  static uint8_t alphatable[256][256];
#ifndef NO_MMX
  static int mmx_available;
#endif
};
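
// Example usage (a minimal sketch, not part of this header; "dst", "src",
// "npixels" and the 50% opacity value 128 are hypothetical):
//
//   Blenders::init();                    // build alphatable before any blending
// #ifndef NO_MMX
//   if (Blenders::MMX_AVAILABLE())
//   {
//     for (int i = 0; i < npixels; i++)
//       dst[i] = Blenders::BLEND_ADJ1_MMX(src[i], dst[i], 128);
//     Blenders::BLEND_MMX_END();         // emms once after a run of *_MMX calls
//   }
//   else
// #endif
//   {
//     for (int i = 0; i < npixels; i++)
//       dst[i] = Blenders::BLEND_ADJ1(src[i], dst[i], 128);
//   }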

// NON MMX

// average blend of a and b.
unsigned int inline Blenders::BLEND_AVG(unsigned int a, unsigned int b)
{
  return ((a >> 1)&~((1 << 7) | (1 << 15) | (1 << 23))) + ((b >> 1)&~((1 << 7) | (1 << 15) | (1 << 23)));
}
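
// Worked example of the mask trick above (illustration only): with
// a = 0x00FF0000 and b = 0x000000FF, (a >> 1) = 0x007F8000; red's low bit has
// leaked into bit 15 (the top of the green field), and the mask clears it,
// giving 0x007F0000.  (b >> 1) = 0x0000007F needs no correction, so the sum is
// 0x007F007F -- each channel is (A + B) / 2, rounded down.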

// multiplies 32 bit color A by scalar V (0-255), per channel (lookup via alphatable).
unsigned int inline Blenders::BLEND_MUL(unsigned int a, int v)
{
  register int t;
  t = Blenders::alphatable[a & 0xFF][v];
  t |= Blenders::alphatable[(a & 0xFF00) >> 8][v] << 8;
  t |= Blenders::alphatable[(a & 0xFF0000) >> 16][v] << 16;
  t |= Blenders::alphatable[(a & 0xFF000000) >> 24][v] << 24;
  return t;
}
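
// The lookups above assume init() fills alphatable as a 256x256 multiplication
// table, roughly alphatable[x][v] ~= (x * v) / 255.  A minimal sketch of such
// an init() under that assumption (the real definition lives in the
// corresponding .cpp and may differ in rounding):
//
//   void Blenders::init()
//   {
//     for (int x = 0; x < 256; x++)
//       for (int v = 0; v < 256; v++)
//         alphatable[x][v] = (uint8_t)((x * v) / 255);
//   }
//
// With such a table, BLEND_MUL(a, 255) returns a unchanged and BLEND_MUL(a, 0)
// returns 0 in every channel.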

// V is a scalar (0-255): per channel, roughly (V*a + (255-V)*b) / 255, i.e. a cross-fade from b to a.
unsigned int inline Blenders::BLEND_ADJ1(unsigned int a, unsigned int b, int v)
{
  register int t;
  t = Blenders::alphatable[b & 0xFF][0xFF - v] + Blenders::alphatable[a & 0xFF][v];
  t |= (Blenders::alphatable[(b & 0xFF00) >> 8][0xFF - v] + Blenders::alphatable[(a & 0xFF00) >> 8][v]) << 8;
  t |= (Blenders::alphatable[(b & 0xFF0000) >> 16][0xFF - v] + Blenders::alphatable[(a & 0xFF0000) >> 16][v]) << 16;
  t |= (Blenders::alphatable[(b & 0xFF000000) >> 24][0xFF - v] + Blenders::alphatable[(a & 0xFF000000) >> 24][v]) << 24;
  return t;
}

// returns a*(1.0-Alpha(b)) + b, per channel, clamped to 0xFF.
unsigned int inline Blenders::BLEND_ADJ2(unsigned int a, unsigned int b)
{
  register int t, z;
  int v = 0xff - ((b >> 24) & 0xff);   // 255 minus b's alpha
  t = Blenders::alphatable[a & 0xFF][v] + (b & 0xFF);
  if (t > 0xFF) t = 0xff;
  z = (Blenders::alphatable[(a & 0xFF00) >> 8][v] << 8) + (b & 0xFF00);
  if (z > 0xFF00) z = 0xff00;
  t |= z;
  z = (Blenders::alphatable[(a & 0xFF0000) >> 16][v] << 16) + ((b & 0xFF0000));
  if (z > 0xFF0000) z = 0xff0000;
  t |= z;
  z = (Blenders::alphatable[(a & 0xFF000000) >> 24][v]) + ((b & 0xFF000000) >> 24);
  if (z > 0xFF) z = 0xff;
  return t | (z << 24);
}
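
// Per-channel reference for BLEND_ADJ2 (a restatement of the code above, not
// an alternative implementation): with s = 255 - ALPHA(b),
//
//   result_c = min(255, A_c * s / 255 + B_c)
//
// i.e. b composited over a, with b's color channels treated as already
// premultiplied by b's alpha.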

// returns a*(1-Alpha(b)*W) + b*W, clamped (W is scalar 0-0xff).
unsigned int inline Blenders::BLEND_ADJ3(unsigned int a, unsigned int b, int w)
{
  register int t, z;
  int v = 0xff - Blenders::alphatable[(b >> 24) & 0xff][w];
  t = Blenders::alphatable[a & 0xFF][v] + Blenders::alphatable[b & 0xFF][w];
  if (t > 0xFF) t = 0xFF;
  z = Blenders::alphatable[(a & 0xFF00) >> 8][v] + Blenders::alphatable[(b & 0xFF00) >> 8][w];
  if (z > 0xFF) z = 0xFF;
  t |= z << 8;
  z = Blenders::alphatable[(a & 0xFF0000) >> 16][v] + Blenders::alphatable[(b & 0xFF0000) >> 16][w];
  if (z > 0xFF) z = 0xFF;
  t |= z << 16;
  z = Blenders::alphatable[(a & 0xFF000000) >> 24][v] + Blenders::alphatable[(b & 0xFF000000) >> 24][w];
  if (z > 0xFF) z = 0xFF;
  return t | (z << 24);
}
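
// Note on BLEND_ADJ3: it is BLEND_ADJ2 with b's contribution scaled by an
// extra weight W/255 -- with W = 255 the two agree (up to table rounding), and
// with W = 0 the result is a unchanged.  Per channel:
//
//   result_c = min(255, A_c * (255 - ALPHA(b)*W/255) / 255 + B_c * W / 255)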

// does bilinear filtering (non-MMX version). p1 is the upper-left source pixel,
// w is the framebuffer width; bits 8-15 of xp and yp give the 0-255 subpixel weights.
unsigned int inline Blenders::BLEND4(unsigned int *p1, unsigned int w, int xp, int yp)
{
  register int t;
  uint8_t a1, a2, a3, a4;
  xp = (xp >> 8) & 0xff;
  yp = (yp >> 8) & 0xff;
  a1 = alphatable[255 - xp][255 - yp];  // weight of upper-left pixel
  a2 = alphatable[xp][255 - yp];        // upper-right
  a3 = alphatable[255 - xp][yp];        // lower-left
  a4 = alphatable[xp][yp];              // lower-right
  t = alphatable[p1[0] & 0xff][a1] + alphatable[p1[1] & 0xff][a2] + alphatable[p1[w] & 0xff][a3] + alphatable[p1[w + 1] & 0xff][a4];
  t |= (alphatable[(p1[0] >> 8) & 0xff][a1] + alphatable[(p1[1] >> 8) & 0xff][a2] + alphatable[(p1[w] >> 8) & 0xff][a3] + alphatable[(p1[w + 1] >> 8) & 0xff][a4]) << 8;
  t |= (alphatable[(p1[0] >> 16) & 0xff][a1] + alphatable[(p1[1] >> 16) & 0xff][a2] + alphatable[(p1[w] >> 16) & 0xff][a3] + alphatable[(p1[w + 1] >> 16) & 0xff][a4]) << 16;
  t |= (alphatable[(p1[0] >> 24) & 0xff][a1] + alphatable[(p1[1] >> 24) & 0xff][a2] + alphatable[(p1[w] >> 24) & 0xff][a3] + alphatable[(p1[w + 1] >> 24) & 0xff][a4]) << 24;
  return t;
}
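
// Usage sketch for BLEND4 (hypothetical names; assumes a 16.16-style fixed
// point source position, of which only fraction bits 8-15 are used):
//
//   int fx = (3 << 16) | 0x8000;                   // x = 3.5 in 16.16 fixed point
//   int fy = (7 << 16) | 0x4000;                   // y = 7.25
//   unsigned int *p = &src[(fy >> 16) * pitch + (fx >> 16)];   // upper-left pixel
//   unsigned int c = Blenders::BLEND4(p, pitch, fx & 0xFFFF, fy & 0xFFFF);
//
// Passing xp = 0x8000 weights the left and right pixel pairs equally.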

#ifndef NO_MMX

#ifdef WIN32
#pragma warning( push, 1 )
#pragma warning(disable: 4799)
#endif

#ifdef WIN32
#define MMX_CONST const
#else
#define MMX_CONST
#endif

// packed constants for the MMX paths: 0x0100 (256) in each 16-bit lane,
// all-zero (used to unpack bytes to words), and the 64-bit value 1.
static unsigned int MMX_CONST Blenders__mmx_revn2[2] = {0x01000100, 0x01000100};
static unsigned int MMX_CONST Blenders__mmx_zero[2];
static unsigned int MMX_CONST Blenders__mmx_one[2] = {1, 0};

#undef MMX_CONST

/// MMX

// average blend of a and b (same scalar trick as BLEND_AVG; no MMX needed).
unsigned int inline Blenders::BLEND_AVG_MMX(unsigned int a, unsigned int b)
{
  return ((a >> 1)&~((1 << 7) | (1 << 15) | (1 << 23))) + ((b >> 1)&~((1 << 7) | (1 << 15) | (1 << 23)));
}

// multiplies 32 bit color A by scalar V (0-255)
unsigned int inline Blenders::BLEND_MUL_MMX(unsigned int a, int v)
{
#ifdef WIN32
  __asm
  {
    movd mm3, [v]          // VVVVVVVV
    movd mm0, [a]
    packuswb mm3, mm3      // 0000HHVV
    punpcklbw mm0, [Blenders__mmx_zero]
    punpcklwd mm3, mm3     // HHVVHHVV
    punpckldq mm3, mm3     // HHVVHHVV HHVVHHVV
    pmullw mm0, mm3
    psrlw mm0, 8
    packuswb mm0, mm0
    movd eax, mm0          // result is returned in eax
  }
#else
  __asm__ volatile (
    "movd %0, %%mm3\n"
    "movd %1, %%mm0\n"
    "packuswb %%mm3, %%mm3\n"
    "punpcklbw (Blenders__mmx_zero), %%mm0\n"
    "punpcklwd %%mm3, %%mm3\n"
    "punpckldq %%mm3, %%mm3\n"
    "pmullw %%mm3, %%mm0\n"
    "psrlw $8, %%mm0\n"
    "packuswb %%mm0, %%mm0\n"
    "movd %%mm0, %%eax\n"
    :
    : "m" (v), "m" (a)
    : "%mm0", "%mm3" );
#endif
}

// V is scalar (0-255), (1.0-V)*b + V*a
unsigned int inline Blenders::BLEND_ADJ1_MMX(unsigned int a, unsigned int b, int v)
{
#ifdef WIN32
  __asm
  {
    movd mm3, [v]          // VVVVVVVV
    movd mm0, [a]
    packuswb mm3, mm3      // 0000HHVV
    movd mm1, [b]
    paddusw mm3, [Blenders__mmx_one]
    movq mm4, [Blenders__mmx_revn2]
    punpcklwd mm3, mm3     // HHVVHHVV
    punpcklbw mm0, [Blenders__mmx_zero]
    punpckldq mm3, mm3     // HHVVHHVV HHVVHHVV
    punpcklbw mm1, [Blenders__mmx_zero]
    psubw mm4, mm3
    pmullw mm0, mm3
    pmullw mm1, mm4
    paddw mm0, mm1
    psrlw mm0, 8
    packuswb mm0, mm0
    movd eax, mm0
  }
#else
  __asm__ volatile (
    "movd %0, %%mm3\n"
    "movd %1, %%mm0\n"
    "packuswb %%mm3, %%mm3\n"
    "movd %2, %%mm1\n"
    "paddusw (Blenders__mmx_one), %%mm3\n"
    "movq (Blenders__mmx_revn2), %%mm4\n"
    "punpcklwd %%mm3, %%mm3\n"
    "punpcklbw (Blenders__mmx_zero), %%mm0\n"
    "punpckldq %%mm3, %%mm3\n"
    "punpcklbw (Blenders__mmx_zero), %%mm1\n"
    "psubw %%mm3, %%mm4\n"
    "pmullw %%mm3, %%mm0\n"
    "pmullw %%mm4, %%mm1\n"
    "paddw %%mm1, %%mm0\n"
    "psrlw $8, %%mm0\n"
    "packuswb %%mm0, %%mm0\n"
    "movd %%mm0, %%eax\n"
    :
    : "m" (v), "m" (a), "m" (b)
    : "%mm0", "%mm1", "%mm3", "%mm4" );
#endif
}

// returns a*(1.0-Alpha(b)) + b
unsigned int inline Blenders::BLEND_ADJ2_MMX(unsigned int a, unsigned int b)
{
#ifdef WIN32
  __asm
  {
    movd mm3, [b]          // VVVVVVVV
    movq mm4, [Blenders__mmx_revn2]
    movd mm0, [a]
    psrld mm3, 24
    movd mm1, [b]
    paddusw mm3, [Blenders__mmx_one]
    punpcklwd mm3, mm3     // HHVVHHVV
    punpcklbw mm0, [Blenders__mmx_zero]
    punpckldq mm3, mm3     // HHVVHHVV HHVVHHVV
    punpcklbw mm1, [Blenders__mmx_zero]
    psubw mm4, mm3
    pmullw mm0, mm4
    // stall
    // stall
    // stall
    psrlw mm0, 8
    // stall
    paddw mm0, mm1
    // stall
    packuswb mm0, mm0
    // stall
    movd eax, mm0
  }
#else
  __asm__ volatile (
    "movd %1, %%mm3\n"
    "movq (Blenders__mmx_revn2), %%mm4\n"
    "movd %0, %%mm0\n"
    "psrld $24, %%mm3\n"
    "movd %1, %%mm1\n"
    "paddusw (Blenders__mmx_one), %%mm3\n"
    "punpcklwd %%mm3, %%mm3\n"
    "punpcklbw (Blenders__mmx_zero), %%mm0\n"
    "punpckldq %%mm3, %%mm3\n"
    "punpcklbw (Blenders__mmx_zero), %%mm1\n"
    "psubw %%mm3, %%mm4\n"
    "pmullw %%mm4, %%mm0\n"
    "psrlw $8, %%mm0\n"
    "paddw %%mm1, %%mm0\n"
    "packuswb %%mm0, %%mm0\n"
    "movd %%mm0, %%eax\n"
    :
    : "m" (a), "m" (b)
    : "%esi", "%mm0", "%mm1", "%mm3", "%mm4" );
#endif
}

// returns a*(1-Alpha(b)*W) + b*W, clamped (W is scalar 0-0xff).
unsigned int inline Blenders::BLEND_ADJ3_MMX(unsigned int a, unsigned int b, int w)
{
#ifdef WIN32
  __asm
  {
    movd mm3, [b]          // VVVVVVVV
    movd mm5, [w]
    movd mm0, [a]
    psrld mm3, 24
    movd mm1, [b]
    paddusw mm3, [Blenders__mmx_one]
    movq mm4, [Blenders__mmx_revn2]
    pmullw mm3, mm5
    packuswb mm5, mm5
    punpcklbw mm0, [Blenders__mmx_zero]
    punpcklwd mm5, mm5
    punpcklbw mm1, [Blenders__mmx_zero]
    psrlw mm3, 8
    punpckldq mm5, mm5
    paddusw mm3, [Blenders__mmx_one]
    punpcklwd mm3, mm3     // HHVVHHVV
    punpckldq mm3, mm3     // HHVVHHVV HHVVHHVV
    psubw mm4, mm3
    pmullw mm0, mm4
    pmullw mm1, mm5
    paddusw mm0, mm1
    psrlw mm0, 8
    packuswb mm0, mm0
    movd eax, mm0
  }
#else
  __asm__ volatile (
    "movd %2, %%mm3\n"
    "movd %0, %%mm5\n"
    "movd %1, %%mm0\n"
    "psrld $24, %%mm3\n"
    "movd %2, %%mm1\n"
    "paddusw (Blenders__mmx_one), %%mm3\n"
    "movq (Blenders__mmx_revn2), %%mm4\n"
    "pmullw %%mm5, %%mm3\n"
    "packuswb %%mm5, %%mm5\n"
    "punpcklbw (Blenders__mmx_zero), %%mm0\n"
    "punpcklwd %%mm5, %%mm5\n"
    "punpcklbw (Blenders__mmx_zero), %%mm1\n"
    "psrlw $8, %%mm3\n"
    "punpckldq %%mm5, %%mm5\n"
    "paddusw (Blenders__mmx_one), %%mm3\n"
    "punpcklwd %%mm3, %%mm3\n"
    "punpckldq %%mm3, %%mm3\n"
    "psubw %%mm3, %%mm4\n"
    "pmullw %%mm4, %%mm0\n"
    "pmullw %%mm5, %%mm1\n"
    "paddusw %%mm1, %%mm0\n"
    "psrlw $8, %%mm0\n"
    "packuswb %%mm0, %%mm0\n"
    "movd %%mm0, %%eax\n"
    :
    : "m" (w), "m" (a), "m" (b)
    : "%mm0", "%mm1", "%mm4", "%mm3", "%mm5" );
#endif
}

// does bilinear filtering. p1 is upper left pixel, w is width of framebuffer.
// xp and yp's low 16 bits are used for the subpixel positioning.
unsigned int inline Blenders::BLEND4_MMX(unsigned int *p1, unsigned int w, int xp, int yp)
{
#ifdef WIN32
  __asm
  {
    movd mm6, xp
    mov eax, p1
    movd mm7, yp
    mov esi, w
    movq mm4, Blenders__mmx_revn2
    psrlw mm6, 8
    movq mm5, Blenders__mmx_revn2
    psrlw mm7, 8
    movd mm0, [eax]
    punpcklwd mm6, mm6
    movd mm1, [eax + 4]
    punpcklwd mm7, mm7
    movd mm2, [eax + esi*4]
    punpckldq mm6, mm6
    movd mm3, [eax + esi*4 + 4]
    punpckldq mm7, mm7
    punpcklbw mm0, [Blenders__mmx_zero]
    psubw mm4, mm6
    punpcklbw mm1, [Blenders__mmx_zero]
    pmullw mm0, mm4
    punpcklbw mm2, [Blenders__mmx_zero]
    pmullw mm1, mm6
    punpcklbw mm3, [Blenders__mmx_zero]
    psubw mm5, mm7
    pmullw mm2, mm4
    pmullw mm3, mm6
    paddw mm0, mm1
    // stall (mm0)
    psrlw mm0, 8
    // stall (waiting for mm3/mm2)
    paddw mm2, mm3
    pmullw mm0, mm5
    psrlw mm2, 8
    // stall (mm2)
    pmullw mm2, mm7
    // stall
    // stall (mm2)
    paddw mm0, mm2
    // stall
    psrlw mm0, 8
    // stall
    packuswb mm0, mm0
    // stall
    movd eax, mm0
  }
#else
  __asm__ volatile (
    "movd %2, %%mm6\n"
    "mov %0, %%eax\n"
    "movd %3, %%mm7\n"
    "mov %1, %%esi\n"
    "movq (Blenders__mmx_revn2), %%mm4\n"
    "psrlw $8, %%mm6\n"
    "movq (Blenders__mmx_revn2), %%mm5\n"
    "psrlw $8, %%mm7\n"
    "movd (%%eax), %%mm0\n"
    "punpcklwd %%mm6,%%mm6\n"
    "movd 4(%%eax), %%mm1\n"
    "punpcklwd %%mm7,%%mm7\n"
    "movd (%%eax,%%esi,4), %%mm2\n"
    "punpckldq %%mm6,%%mm6\n"
    "movd 4(%%eax,%%esi,4), %%mm3\n"
    "punpckldq %%mm7,%%mm7\n"
    "punpcklbw (Blenders__mmx_zero), %%mm0\n"
    "psubw %%mm6, %%mm4\n"
    "punpcklbw (Blenders__mmx_zero), %%mm1\n"
    "pmullw %%mm4, %%mm0\n"
    "punpcklbw (Blenders__mmx_zero), %%mm2\n"
    "pmullw %%mm6, %%mm1\n"
    "punpcklbw (Blenders__mmx_zero), %%mm3\n"
    "psubw %%mm7, %%mm5\n"
    "pmullw %%mm4, %%mm2\n"
    "pmullw %%mm6, %%mm3\n"
    "paddw %%mm1, %%mm0\n"
    "psrlw $8, %%mm0\n"
    "paddw %%mm3, %%mm2\n"
    "pmullw %%mm5, %%mm0\n"
    "psrlw $8, %%mm2\n"
    "pmullw %%mm7, %%mm2\n"
    "paddw %%mm2, %%mm0\n"
    "psrlw $8, %%mm0\n"
    "packuswb %%mm0, %%mm0\n"
    "movd %%mm0, %%eax\n"
    :
    : "m" (p1), "m" (w), "m" (xp), "m" (yp)
    : "%eax", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" );
#endif
}

#ifdef WIN32
#pragma warning( pop )
#endif

#endif // ndef NO_MMX

#endif // _BLENDING_H_