/****************************************************************************
*
* Module Title : scaleopt.cpp
*
* Description : Optimized scaling functions
*
****************************************************************************/
  /****************************************************************************
  * Module Statics
  *
  * Blend weights in 8-bit fixed point (value / 256), one 16-bit lane per
  * pixel so each table fills exactly one 64-bit MMX register:
  *   51 ~ 1/5, 102 ~ 2/5, 154 ~ 3/5, 205 ~ 4/5.
  * Complementary weights add up to 256 (51 + 205, 102 + 154), so blending a
  * pixel with itself returns the pixel unchanged after the >> 8.
  *
  * roundValues : +128 rounding term added before the >> 8.
  * fourOnes    : +1 rounding term for the (a + b + 1) >> 1 average.
  * const45_1/2 : per-lane left/right weights for the horizontal 4 -> 5 filter.
  * const35_1/2 : per-lane left/right weights for the horizontal 3 -> 5 filter.
  * mask45      : selects byte 6 of a quadword (used to replicate the last
  *               pixel in the 4 -> 5 tail).
  ****************************************************************************/
  #if defined(_MSC_VER)
  #define SCALEOPT_ALIGN16 __declspec(align(16))
  #else
  #define SCALEOPT_ALIGN16 __attribute__((aligned(16)))
  #endif
  SCALEOPT_ALIGN16 static const unsigned short oneFifth[] = { 51, 51, 51, 51 };
  SCALEOPT_ALIGN16 static const unsigned short twoFifths[] = { 102, 102, 102, 102 };
  SCALEOPT_ALIGN16 static const unsigned short threeFifths[] = { 154, 154, 154, 154 };
  SCALEOPT_ALIGN16 static const unsigned short fourFifths[] = { 205, 205, 205, 205 };
  SCALEOPT_ALIGN16 static const unsigned short roundValues[] = { 128, 128, 128, 128 };
  SCALEOPT_ALIGN16 static const unsigned short fourOnes[] = { 1, 1, 1, 1 };
  SCALEOPT_ALIGN16 static const unsigned short const45_2[] = { 205, 154, 102, 51 };
  SCALEOPT_ALIGN16 static const unsigned short const45_1[] = { 51, 102, 154, 205 };
  SCALEOPT_ALIGN16 static const unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0 };
  SCALEOPT_ALIGN16 static const unsigned short const35_2[] = { 154, 51, 205, 102 };
  SCALEOPT_ALIGN16 static const unsigned short const35_1[] = { 102, 205, 51, 154 };
  #if defined(__cplusplus)
  extern "C" {
  #endif
  25. /****************************************************************************
  26. *
  27. * ROUTINE : HorizontalLine_3_5_Scale_MMX
  28. *
  29. * INPUTS : const unsigned char *source :
  30. * unsigned int sourceWidth :
  31. * unsigned char *dest :
  32. * unsigned int destWidth :
  33. *
  34. * OUTPUTS : None.
  35. *
  36. * RETURNS : void
  37. *
  38. * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
  39. *
  40. * SPECIAL NOTES : None.
  41. *
  42. ****************************************************************************/
  43. void HorizontalLine_3_5_Scale_MMX
  44. (
  45. const unsigned char *source,
  46. unsigned int sourceWidth,
  47. unsigned char *dest,
  48. unsigned int destWidth
  49. )
  50. {
  51. (void) destWidth;
  52. __asm
  53. {
  54. push ebx
  55. mov esi, source
  56. mov edi, dest
  57. mov ecx, sourceWidth
  58. lea edx, [esi+ecx-3];
  59. movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
  60. movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
  61. movq mm4, roundValues // mm4 = 80 xx 80 xx 80 xx 80 xx
  62. pxor mm7, mm7 // clear mm7
  63. HorizLine_3_5_Loop:
  64. mov eax, DWORD PTR [esi] // eax = 00 01 02 03
  65. mov ebx, eax
  66. and ebx, 0xffff00 // ebx = xx 01 02 xx
  67. mov ecx, eax // ecx = 00 01 02 03
  68. and eax, 0xffff0000 // eax = xx xx 02 03
  69. xor ecx, eax // ecx = 00 01 xx xx
  70. shr ebx, 8 // ebx = 01 02 xx xx
  71. or eax, ebx // eax = 01 02 02 03
  72. shl ebx, 16 // ebx = xx xx 01 02
  73. movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
  74. or ebx, ecx // ebx = 00 01 01 02
  75. punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
  76. movd mm0, ebx // mm0 = 00 01 01 02
  77. pmullw mm1, mm6 //
  78. punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
  79. pmullw mm0, mm5 //
  80. mov [edi], ebx // writeoutput 00 xx xx xx
  81. add esi, 3
  82. add edi, 5
  83. paddw mm0, mm1
  84. paddw mm0, mm4
  85. psrlw mm0, 8
  86. cmp esi, edx
  87. packuswb mm0, mm7
  88. movd DWORD Ptr [edi-4], mm0
  89. jl HorizLine_3_5_Loop
  90. //Exit:
  91. mov eax, DWORD PTR [esi] // eax = 00 01 02 03
  92. mov ebx, eax
  93. and ebx, 0xffff00 // ebx = xx 01 02 xx
  94. mov ecx, eax // ecx = 00 01 02 03
  95. and eax, 0xffff0000 // eax = xx xx 02 03
  96. xor ecx, eax // ecx = 00 01 xx xx
  97. shr ebx, 8 // ebx = 01 02 xx xx
  98. or eax, ebx // eax = 01 02 02 03
  99. shl eax, 8 // eax = xx 01 02 02
  100. and eax, 0xffff0000 // eax = xx xx 02 02
  101. or eax, ebx // eax = 01 02 02 02
  102. shl ebx, 16 // ebx = xx xx 01 02
  103. movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
  104. or ebx, ecx // ebx = 00 01 01 02
  105. punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
  106. movd mm0, ebx // mm0 = 00 01 01 02
  107. pmullw mm1, mm6 //
  108. punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
  109. pmullw mm0, mm5 //
  110. mov [edi], ebx // writeoutput 00 xx xx xx
  111. paddw mm0, mm1
  112. paddw mm0, mm4
  113. psrlw mm0, 8
  114. packuswb mm0, mm7
  115. movd DWORD Ptr [edi+1], mm0
  116. pop ebx
  117. }
  118. /*
  119. const unsigned char *src = source;
  120. unsigned char *des = dest;
  121. unsigned int a, b, c ;
  122. unsigned int i;
  123. (void) destWidth;
  124. for ( i=0; i<sourceWidth-3; i+=3 )
  125. {
  126. a = src[0];
  127. b = src[1];
  128. des [0] = (UINT8) (a);
  129. // 2 * left + 3 * right /5
  130. des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
  131. c = src[2] ;
  132. // 4 * left + 1 * right /5
  133. des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
  134. // 1 * left + 4 * right /5
  135. des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
  136. a = src[3];
  137. // 3 * left + 2 * right /5
  138. des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8);
  139. src += 3;
  140. des += 5;
  141. }
  142. a = src[0];
  143. b = src[1];
  144. des [0] = (UINT8) (a);
  145. // 2 * left + 3 * right /5
  146. des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
  147. c = src[2] ;
  148. // 4 * left + 1 * right /5
  149. des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
  150. // 1 * left + 4 * right /5
  151. des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
  152. des [4] = (UINT8) (c);
  153. */
  154. }
  155. /****************************************************************************
  156. *
  157. * ROUTINE : HorizontalLine_4_5_Scale_MMX
  158. *
  159. * INPUTS : const unsigned char *source :
  160. * unsigned int sourceWidth :
  161. * unsigned char *dest :
  162. * unsigned int destWidth :
  163. *
  164. * OUTPUTS : None.
  165. *
  166. * RETURNS : void
  167. *
  168. * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
  169. *
  170. * SPECIAL NOTES : None.
  171. *
  172. ****************************************************************************/
  173. void HorizontalLine_4_5_Scale_MMX
  174. (
  175. const unsigned char *source,
  176. unsigned int sourceWidth,
  177. unsigned char *dest,
  178. unsigned int destWidth
  179. )
  180. {
  181. (void)destWidth;
  182. __asm
  183. {
  184. mov esi, source
  185. mov edi, dest
  186. mov ecx, sourceWidth
  187. lea edx, [esi+ecx-8];
  188. movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
  189. movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
  190. movq mm4, roundValues // mm4 = 80 xx 80 xx 80 xx 80 xx
  191. pxor mm7, mm7 // clear mm7
  192. HorizLine_4_5_Loop:
  193. movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
  194. movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08
  195. movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
  196. movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
  197. movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
  198. punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
  199. punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
  200. pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
  201. pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
  202. punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
  203. movd DWORD PTR [edi+5], mm2 // write ouput 05 xx xx xx
  204. pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
  205. punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
  206. pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
  207. paddw mm0, mm1 // added round values
  208. paddw mm0, mm4
  209. psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
  210. packuswb mm0, mm7
  211. movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
  212. add edi, 10
  213. add esi, 8
  214. paddw mm2, mm3 //
  215. paddw mm2, mm4 // added round values
  216. cmp esi, edx
  217. psrlw mm2, 8
  218. packuswb mm2, mm7
  219. movd DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
  220. jl HorizLine_4_5_Loop
  221. //Exit:
  222. movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
  223. movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
  224. movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
  225. psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
  226. movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
  227. pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
  228. psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
  229. por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
  230. movq mm3, mm1
  231. movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
  232. punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
  233. punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
  234. pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
  235. pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
  236. punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
  237. movd DWORD PTR [edi+5], mm2 // write ouput 05 xx xx xx
  238. pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
  239. punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
  240. pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
  241. paddw mm0, mm1 // added round values
  242. paddw mm0, mm4
  243. psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
  244. packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
  245. movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
  246. paddw mm2, mm3 //
  247. paddw mm2, mm4 // added round values
  248. psrlw mm2, 8
  249. packuswb mm2, mm7
  250. movd DWORD PTR [edi+6], mm2 // writeoutput 06 07 08 09
  251. }
  252. /*
  253. const unsigned char *src = source;
  254. unsigned char *des = dest;
  255. unsigned int a, b, c ;
  256. unsigned i;
  257. (void) destWidth;
  258. for ( i=0; i<sourceWidth-4; i+=4 )
  259. {
  260. a = src[0];
  261. b = src[1];
  262. des [0] = (UINT8) a;
  263. des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
  264. c = src[2] * 154;
  265. a = src[3];
  266. des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
  267. des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
  268. b = src[4];
  269. des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8);
  270. src += 4;
  271. des += 5;
  272. }
  273. a = src[0];
  274. b = src[1];
  275. des [0] = (UINT8) (a);
  276. des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
  277. c = src[2] * 154;
  278. a = src[3];
  279. des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
  280. des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
  281. des [4] = (UINT8) (a);
  282. */
  283. }
  284. /****************************************************************************
  285. *
  286. * ROUTINE : VerticalBand_4_5_Scale_MMX
  287. *
  288. * INPUTS : unsigned char *dest :
  289. * unsigned int destPitch :
  290. * unsigned int destWidth :
  291. *
  292. * OUTPUTS : None.
  293. *
  294. * RETURNS : void
  295. *
  296. * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
  297. *
  298. * SPECIAL NOTES : The routine uses the first line of the band below
  299. * the current band. The function also has a "C" only
  300. * version.
  301. *
  302. ****************************************************************************/
  303. void VerticalBand_4_5_Scale_MMX
  304. (
  305. unsigned char *dest,
  306. unsigned int destPitch,
  307. unsigned int destWidth
  308. )
  309. {
  310. __asm
  311. {
  312. mov esi, dest // Get the source and destination pointer
  313. mov ecx, destPitch // Get the pitch size
  314. lea edi, [esi+ecx*2] // tow lines below
  315. add edi, ecx // three lines below
  316. pxor mm7, mm7 // clear out mm7
  317. mov edx, destWidth // Loop counter
  318. VS_4_5_loop:
  319. movq mm0, QWORD ptr [esi] // src[0];
  320. movq mm1, QWORD ptr [esi+ecx] // src[1];
  321. movq mm2, mm0 // Make a copy
  322. punpcklbw mm0, mm7 // unpack low to word
  323. movq mm5, oneFifth
  324. punpckhbw mm2, mm7 // unpack high to word
  325. pmullw mm0, mm5 // a * 1/5
  326. movq mm3, mm1 // make a copy
  327. punpcklbw mm1, mm7 // unpack low to word
  328. pmullw mm2, mm5 // a * 1/5
  329. movq mm6, fourFifths // constan
  330. movq mm4, mm1 // copy of low b
  331. pmullw mm4, mm6 // b * 4/5
  332. punpckhbw mm3, mm7 // unpack high to word
  333. movq mm5, mm3 // copy of high b
  334. pmullw mm5, mm6 // b * 4/5
  335. paddw mm0, mm4 // a * 1/5 + b * 4/5
  336. paddw mm2, mm5 // a * 1/5 + b * 4/5
  337. paddw mm0, roundValues // + 128
  338. paddw mm2, roundValues // + 128
  339. psrlw mm0, 8
  340. psrlw mm2, 8
  341. packuswb mm0, mm2 // des [1]
  342. movq QWORD ptr [esi+ecx], mm0 // write des[1]
  343. movq mm0, [esi+ecx*2] // mm0 = src[2]
  344. // mm1, mm3 --- Src[1]
  345. // mm0 --- Src[2]
  346. // mm7 for unpacking
  347. movq mm5, twoFifths
  348. movq mm2, mm0 // make a copy
  349. pmullw mm1, mm5 // b * 2/5
  350. movq mm6, threeFifths
  351. punpcklbw mm0, mm7 // unpack low to word
  352. pmullw mm3, mm5 // b * 2/5
  353. movq mm4, mm0 // make copy of c
  354. punpckhbw mm2, mm7 // unpack high to word
  355. pmullw mm4, mm6 // c * 3/5
  356. movq mm5, mm2
  357. pmullw mm5, mm6 // c * 3/5
  358. paddw mm1, mm4 // b * 2/5 + c * 3/5
  359. paddw mm3, mm5 // b * 2/5 + c * 3/5
  360. paddw mm1, roundValues // + 128
  361. paddw mm3, roundValues // + 128
  362. psrlw mm1, 8
  363. psrlw mm3, 8
  364. packuswb mm1, mm3 // des[2]
  365. movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
  366. movq mm1, [edi] // mm1=Src[3];
  367. // mm0, mm2 --- Src[2]
  368. // mm1 --- Src[3]
  369. // mm6 --- 3/5
  370. // mm7 for unpacking
  371. pmullw mm0, mm6 // c * 3/5
  372. movq mm5, twoFifths // mm5 = 2/5
  373. movq mm3, mm1 // make a copy
  374. pmullw mm2, mm6 // c * 3/5
  375. punpcklbw mm1, mm7 // unpack low
  376. movq mm4, mm1 // make a copy
  377. punpckhbw mm3, mm7 // unpack high
  378. pmullw mm4, mm5 // d * 2/5
  379. movq mm6, mm3 // make a copy
  380. pmullw mm6, mm5 // d * 2/5
  381. paddw mm0, mm4 // c * 3/5 + d * 2/5
  382. paddw mm2, mm6 // c * 3/5 + d * 2/5
  383. paddw mm0, roundValues // + 128
  384. paddw mm2, roundValues // + 128
  385. psrlw mm0, 8
  386. psrlw mm2, 8
  387. packuswb mm0, mm2 // des[3]
  388. movq QWORD ptr [edi], mm0 // write des[3]
  389. // mm1, mm3 --- Src[3]
  390. // mm7 -- cleared for unpacking
  391. movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group
  392. movq mm5, fourFifths // mm5 = 4/5
  393. pmullw mm1, mm5 // d * 4/5
  394. movq mm6, oneFifth // mm6 = 1/5
  395. movq mm2, mm0 // make a copy
  396. pmullw mm3, mm5 // d * 4/5
  397. punpcklbw mm0, mm7 // unpack low
  398. pmullw mm0, mm6 // an * 1/5
  399. punpckhbw mm2, mm7 // unpack high
  400. paddw mm1, mm0 // d * 4/5 + an * 1/5
  401. pmullw mm2, mm6 // an * 1/5
  402. paddw mm3, mm2 // d * 4/5 + an * 1/5
  403. paddw mm1, roundValues // + 128
  404. paddw mm3, roundValues // + 128
  405. psrlw mm1, 8
  406. psrlw mm3, 8
  407. packuswb mm1, mm3 // des[4]
  408. movq QWORD ptr [edi+ecx], mm1 // write des[4]
  409. add edi, 8
  410. add esi, 8
  411. sub edx, 8
  412. jg VS_4_5_loop
  413. }
  414. }
  415. /****************************************************************************
  416. *
  417. * ROUTINE : LastVerticalBand_4_5_Scale_MMX
  418. *
  419. * INPUTS : unsigned char *dest :
  420. * unsigned int destPitch :
  421. * unsigned int destWidth :
  422. *
  423. * OUTPUTS : None.
  424. *
  425. * RETURNS : None
  426. *
  427. * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
  428. *
  429. * SPECIAL NOTES : The routine uses the first line of the band below
  430. * the current band. The function also has an "C" only
  431. * version.
  432. *
  433. ****************************************************************************/
  434. void LastVerticalBand_4_5_Scale_MMX
  435. (
  436. unsigned char *dest,
  437. unsigned int destPitch,
  438. unsigned int destWidth
  439. )
  440. {
  441. __asm
  442. {
  443. mov esi, dest // Get the source and destination pointer
  444. mov ecx, destPitch // Get the pitch size
  445. lea edi, [esi+ecx*2] // tow lines below
  446. add edi, ecx // three lines below
  447. pxor mm7, mm7 // clear out mm7
  448. mov edx, destWidth // Loop counter
  449. LastVS_4_5_loop:
  450. movq mm0, QWORD ptr [esi] // src[0];
  451. movq mm1, QWORD ptr [esi+ecx] // src[1];
  452. movq mm2, mm0 // Make a copy
  453. punpcklbw mm0, mm7 // unpack low to word
  454. movq mm5, oneFifth
  455. punpckhbw mm2, mm7 // unpack high to word
  456. pmullw mm0, mm5 // a * 1/5
  457. movq mm3, mm1 // make a copy
  458. punpcklbw mm1, mm7 // unpack low to word
  459. pmullw mm2, mm5 // a * 1/5
  460. movq mm6, fourFifths // constan
  461. movq mm4, mm1 // copy of low b
  462. pmullw mm4, mm6 // b * 4/5
  463. punpckhbw mm3, mm7 // unpack high to word
  464. movq mm5, mm3 // copy of high b
  465. pmullw mm5, mm6 // b * 4/5
  466. paddw mm0, mm4 // a * 1/5 + b * 4/5
  467. paddw mm2, mm5 // a * 1/5 + b * 4/5
  468. paddw mm0, roundValues // + 128
  469. paddw mm2, roundValues // + 128
  470. psrlw mm0, 8
  471. psrlw mm2, 8
  472. packuswb mm0, mm2 // des [1]
  473. movq QWORD ptr [esi+ecx], mm0 // write des[1]
  474. movq mm0, [esi+ecx*2] // mm0 = src[2]
  475. // mm1, mm3 --- Src[1]
  476. // mm0 --- Src[2]
  477. // mm7 for unpacking
  478. movq mm5, twoFifths
  479. movq mm2, mm0 // make a copy
  480. pmullw mm1, mm5 // b * 2/5
  481. movq mm6, threeFifths
  482. punpcklbw mm0, mm7 // unpack low to word
  483. pmullw mm3, mm5 // b * 2/5
  484. movq mm4, mm0 // make copy of c
  485. punpckhbw mm2, mm7 // unpack high to word
  486. pmullw mm4, mm6 // c * 3/5
  487. movq mm5, mm2
  488. pmullw mm5, mm6 // c * 3/5
  489. paddw mm1, mm4 // b * 2/5 + c * 3/5
  490. paddw mm3, mm5 // b * 2/5 + c * 3/5
  491. paddw mm1, roundValues // + 128
  492. paddw mm3, roundValues // + 128
  493. psrlw mm1, 8
  494. psrlw mm3, 8
  495. packuswb mm1, mm3 // des[2]
  496. movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
  497. movq mm1, [edi] // mm1=Src[3];
  498. movq QWORD ptr [edi+ecx], mm1 // write des[4];
  499. // mm0, mm2 --- Src[2]
  500. // mm1 --- Src[3]
  501. // mm6 --- 3/5
  502. // mm7 for unpacking
  503. pmullw mm0, mm6 // c * 3/5
  504. movq mm5, twoFifths // mm5 = 2/5
  505. movq mm3, mm1 // make a copy
  506. pmullw mm2, mm6 // c * 3/5
  507. punpcklbw mm1, mm7 // unpack low
  508. movq mm4, mm1 // make a copy
  509. punpckhbw mm3, mm7 // unpack high
  510. pmullw mm4, mm5 // d * 2/5
  511. movq mm6, mm3 // make a copy
  512. pmullw mm6, mm5 // d * 2/5
  513. paddw mm0, mm4 // c * 3/5 + d * 2/5
  514. paddw mm2, mm6 // c * 3/5 + d * 2/5
  515. paddw mm0, roundValues // + 128
  516. paddw mm2, roundValues // + 128
  517. psrlw mm0, 8
  518. psrlw mm2, 8
  519. packuswb mm0, mm2 // des[3]
  520. movq QWORD ptr [edi], mm0 // write des[3]
  521. // mm1, mm3 --- Src[3]
  522. // mm7 -- cleared for unpacking
  523. add edi, 8
  524. add esi, 8
  525. sub edx, 8
  526. jg LastVS_4_5_loop
  527. }
  528. }
  529. /****************************************************************************
  530. *
  531. * ROUTINE : VerticalBand_3_5_Scale_MMX
  532. *
  533. * INPUTS : unsigned char *dest :
  534. * unsigned int destPitch :
  535. * unsigned int destWidth :
  536. *
  537. * OUTPUTS : None.
  538. *
  539. * RETURNS : void
  540. *
  541. * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
  542. *
  543. * SPECIAL NOTES : The routine uses the first line of the band below
  544. * the current band. The function also has an "C" only
  545. * version.
  546. *
  547. ****************************************************************************/
  548. void VerticalBand_3_5_Scale_MMX
  549. (
  550. unsigned char *dest,
  551. unsigned int destPitch,
  552. unsigned int destWidth
  553. )
  554. {
  555. __asm
  556. {
  557. mov esi, dest // Get the source and destination pointer
  558. mov ecx, destPitch // Get the pitch size
  559. lea edi, [esi+ecx*2] // tow lines below
  560. add edi, ecx // three lines below
  561. pxor mm7, mm7 // clear out mm7
  562. mov edx, destWidth // Loop counter
  563. VS_3_5_loop:
  564. movq mm0, QWORD ptr [esi] // src[0];
  565. movq mm1, QWORD ptr [esi+ecx] // src[1];
  566. movq mm2, mm0 // Make a copy
  567. punpcklbw mm0, mm7 // unpack low to word
  568. movq mm5, twoFifths // mm5 = 2/5
  569. punpckhbw mm2, mm7 // unpack high to word
  570. pmullw mm0, mm5 // a * 2/5
  571. movq mm3, mm1 // make a copy
  572. punpcklbw mm1, mm7 // unpack low to word
  573. pmullw mm2, mm5 // a * 2/5
  574. movq mm6, threeFifths // mm6 = 3/5
  575. movq mm4, mm1 // copy of low b
  576. pmullw mm4, mm6 // b * 3/5
  577. punpckhbw mm3, mm7 // unpack high to word
  578. movq mm5, mm3 // copy of high b
  579. pmullw mm5, mm6 // b * 3/5
  580. paddw mm0, mm4 // a * 2/5 + b * 3/5
  581. paddw mm2, mm5 // a * 2/5 + b * 3/5
  582. paddw mm0, roundValues // + 128
  583. paddw mm2, roundValues // + 128
  584. psrlw mm0, 8
  585. psrlw mm2, 8
  586. packuswb mm0, mm2 // des [1]
  587. movq QWORD ptr [esi+ecx], mm0 // write des[1]
  588. movq mm0, [esi+ecx*2] // mm0 = src[2]
  589. // mm1, mm3 --- Src[1]
  590. // mm0 --- Src[2]
  591. // mm7 for unpacking
  592. movq mm4, mm1 // b low
  593. pmullw mm1, fourFifths // b * 4/5 low
  594. movq mm5, mm3 // b high
  595. pmullw mm3, fourFifths // b * 4/5 high
  596. movq mm2, mm0 // c
  597. pmullw mm4, oneFifth // b * 1/5
  598. punpcklbw mm0, mm7 // c low
  599. pmullw mm5, oneFifth // b * 1/5
  600. movq mm6, mm0 // make copy of c low
  601. punpckhbw mm2, mm7 // c high
  602. pmullw mm6, oneFifth // c * 1/5 low
  603. movq mm7, mm2 // make copy of c high
  604. pmullw mm7, oneFifth // c * 1/5 high
  605. paddw mm1, mm6 // b * 4/5 + c * 1/5 low
  606. paddw mm3, mm7 // b * 4/5 + c * 1/5 high
  607. movq mm6, mm0 // make copy of c low
  608. pmullw mm6, fourFifths // c * 4/5 low
  609. movq mm7, mm2 // make copy of c high
  610. pmullw mm7, fourFifths // c * 4/5 high
  611. paddw mm4, mm6 // b * 1/5 + c * 4/5 low
  612. paddw mm5, mm7 // b * 1/5 + c * 4/5 high
  613. paddw mm1, roundValues // + 128
  614. paddw mm3, roundValues // + 128
  615. psrlw mm1, 8
  616. psrlw mm3, 8
  617. packuswb mm1, mm3 // des[2]
  618. movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
  619. paddw mm4, roundValues // + 128
  620. paddw mm5, roundValues // + 128
  621. psrlw mm4, 8
  622. psrlw mm5, 8
  623. packuswb mm4, mm5 // des[3]
  624. movq QWORD ptr [edi], mm4 // write des[3]
  625. // mm0, mm2 --- Src[3]
  626. pxor mm7, mm7 // clear mm7 for unpacking
  627. movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group
  628. movq mm5, threeFifths // mm5 = 3/5
  629. pmullw mm0, mm5 // d * 3/5
  630. movq mm6, twoFifths // mm6 = 2/5
  631. movq mm3, mm1 // make a copy
  632. pmullw mm2, mm5 // d * 3/5
  633. punpcklbw mm1, mm7 // unpack low
  634. pmullw mm1, mm6 // an * 2/5
  635. punpckhbw mm3, mm7 // unpack high
  636. paddw mm0, mm1 // d * 3/5 + an * 2/5
  637. pmullw mm3, mm6 // an * 2/5
  638. paddw mm2, mm3 // d * 3/5 + an * 2/5
  639. paddw mm0, roundValues // + 128
  640. paddw mm2, roundValues // + 128
  641. psrlw mm0, 8
  642. psrlw mm2, 8
  643. packuswb mm0, mm2 // des[4]
  644. movq QWORD ptr [edi+ecx], mm0 // write des[4]
  645. add edi, 8
  646. add esi, 8
  647. sub edx, 8
  648. jg VS_3_5_loop
  649. }
  650. }
  651. /****************************************************************************
  652. *
  653. * ROUTINE : LastVerticalBand_3_5_Scale_MMX
  654. *
  655. * INPUTS : unsigned char *dest :
  656. * unsigned int destPitch :
  657. * unsigned int destWidth :
  658. *
  659. * OUTPUTS : None.
  660. *
  661. * RETURNS : void
  662. *
  663. * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
  664. *
  665. * SPECIAL NOTES : The routine uses the first line of the band below
  666. * the current band. The function also has an "C" only
  667. * version.
  668. *
  669. ****************************************************************************/
  670. void LastVerticalBand_3_5_Scale_MMX
  671. (
  672. unsigned char *dest,
  673. unsigned int destPitch,
  674. unsigned int destWidth
  675. )
  676. {
  677. __asm
  678. {
  679. mov esi, dest // Get the source and destination pointer
  680. mov ecx, destPitch // Get the pitch size
  681. lea edi, [esi+ecx*2] // tow lines below
  682. add edi, ecx // three lines below
  683. pxor mm7, mm7 // clear out mm7
  684. mov edx, destWidth // Loop counter
  685. LastVS_3_5_loop:
  686. movq mm0, QWORD ptr [esi] // src[0];
  687. movq mm1, QWORD ptr [esi+ecx] // src[1];
  688. movq mm2, mm0 // Make a copy
  689. punpcklbw mm0, mm7 // unpack low to word
  690. movq mm5, twoFifths // mm5 = 2/5
  691. punpckhbw mm2, mm7 // unpack high to word
  692. pmullw mm0, mm5 // a * 2/5
  693. movq mm3, mm1 // make a copy
  694. punpcklbw mm1, mm7 // unpack low to word
  695. pmullw mm2, mm5 // a * 2/5
  696. movq mm6, threeFifths // mm6 = 3/5
  697. movq mm4, mm1 // copy of low b
  698. pmullw mm4, mm6 // b * 3/5
  699. punpckhbw mm3, mm7 // unpack high to word
  700. movq mm5, mm3 // copy of high b
  701. pmullw mm5, mm6 // b * 3/5
  702. paddw mm0, mm4 // a * 2/5 + b * 3/5
  703. paddw mm2, mm5 // a * 2/5 + b * 3/5
  704. paddw mm0, roundValues // + 128
  705. paddw mm2, roundValues // + 128
  706. psrlw mm0, 8
  707. psrlw mm2, 8
  708. packuswb mm0, mm2 // des [1]
  709. movq QWORD ptr [esi+ecx], mm0 // write des[1]
  710. movq mm0, [esi+ecx*2] // mm0 = src[2]
  711. // mm1, mm3 --- Src[1]
  712. // mm0 --- Src[2]
  713. // mm7 for unpacking
  714. movq mm4, mm1 // b low
  715. pmullw mm1, fourFifths // b * 4/5 low
  716. movq QWORD ptr [edi+ecx], mm0 // write des[4]
  717. movq mm5, mm3 // b high
  718. pmullw mm3, fourFifths // b * 4/5 high
  719. movq mm2, mm0 // c
  720. pmullw mm4, oneFifth // b * 1/5
  721. punpcklbw mm0, mm7 // c low
  722. pmullw mm5, oneFifth // b * 1/5
  723. movq mm6, mm0 // make copy of c low
  724. punpckhbw mm2, mm7 // c high
  725. pmullw mm6, oneFifth // c * 1/5 low
  726. movq mm7, mm2 // make copy of c high
  727. pmullw mm7, oneFifth // c * 1/5 high
  728. paddw mm1, mm6 // b * 4/5 + c * 1/5 low
  729. paddw mm3, mm7 // b * 4/5 + c * 1/5 high
  730. movq mm6, mm0 // make copy of c low
  731. pmullw mm6, fourFifths // c * 4/5 low
  732. movq mm7, mm2 // make copy of c high
  733. pmullw mm7, fourFifths // c * 4/5 high
  734. paddw mm4, mm6 // b * 1/5 + c * 4/5 low
  735. paddw mm5, mm7 // b * 1/5 + c * 4/5 high
  736. paddw mm1, roundValues // + 128
  737. paddw mm3, roundValues // + 128
  738. psrlw mm1, 8
  739. psrlw mm3, 8
  740. packuswb mm1, mm3 // des[2]
  741. movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
  742. paddw mm4, roundValues // + 128
  743. paddw mm5, roundValues // + 128
  744. psrlw mm4, 8
  745. psrlw mm5, 8
  746. packuswb mm4, mm5 // des[3]
  747. movq QWORD ptr [edi], mm4 // write des[3]
  748. // mm0, mm2 --- Src[3]
  749. add edi, 8
  750. add esi, 8
  751. sub edx, 8
  752. jg LastVS_3_5_loop
  753. }
  754. }
  755. /****************************************************************************
  756. *
  757. * ROUTINE : VerticalBand_1_2_Scale_MMX
  758. *
  759. * INPUTS : unsigned char *dest :
  760. * unsigned int destPitch :
  761. * unsigned int destWidth :
  762. *
  763. * OUTPUTS : None.
  764. *
  765. * RETURNS : void
  766. *
  767. * FUNCTION : 1 to 2 up-scaling of a band of pixels.
  768. *
  769. * SPECIAL NOTES : The routine uses the first line of the band below
  770. * the current band. The function also has an "C" only
  771. * version.
  772. *
  773. ****************************************************************************/
  774. void VerticalBand_1_2_Scale_MMX
  775. (
  776. unsigned char *dest,
  777. unsigned int destPitch,
  778. unsigned int destWidth
  779. )
  780. {
  781. __asm
  782. {
  783. mov esi, dest // Get the source and destination pointer
  784. mov ecx, destPitch // Get the pitch size
  785. pxor mm7, mm7 // clear out mm7
  786. mov edx, destWidth // Loop counter
  787. VS_1_2_loop:
  788. movq mm0, [esi] // get Src[0]
  789. movq mm1, [esi + ecx * 2] // get Src[1]
  790. movq mm2, mm0 // make copy before unpack
  791. movq mm3, mm1 // make copy before unpack
  792. punpcklbw mm0, mm7 // low Src[0]
  793. movq mm6, fourOnes // mm6= 1, 1, 1, 1
  794. punpcklbw mm1, mm7 // low Src[1]
  795. paddw mm0, mm1 // low (a + b)
  796. punpckhbw mm2, mm7 // high Src[0]
  797. paddw mm0, mm6 // low (a + b + 1)
  798. punpckhbw mm3, mm7
  799. paddw mm2, mm3 // high (a + b )
  800. psraw mm0, 1 // low (a + b +1 )/2
  801. paddw mm2, mm6 // high (a + b + 1)
  802. psraw mm2, 1 // high (a + b + 1)/2
  803. packuswb mm0, mm2 // pack results
  804. movq [esi+ecx], mm0 // write out eight bytes
  805. add esi, 8
  806. sub edx, 8
  807. jg VS_1_2_loop
  808. }
  809. }
  810. /****************************************************************************
  811. *
  812. * ROUTINE : LastVerticalBand_1_2_Scale_MMX
  813. *
  814. * INPUTS : unsigned char *dest :
  815. * unsigned int destPitch :
  816. * unsigned int destWidth :
  817. *
  818. * OUTPUTS : None.
  819. *
  820. * RETURNS : void
  821. *
  822. * FUNCTION : 1 to 2 up-scaling of band of pixels.
  823. *
  824. * SPECIAL NOTES : The routine uses the first line of the band below
  825. * the current band. The function also has an "C" only
  826. * version.
  827. *
  828. ****************************************************************************/
  829. void LastVerticalBand_1_2_Scale_MMX
  830. (
  831. unsigned char *dest,
  832. unsigned int destPitch,
  833. unsigned int destWidth
  834. )
  835. {
  836. __asm
  837. {
  838. mov esi, dest // Get the source and destination pointer
  839. mov ecx, destPitch // Get the pitch size
  840. mov edx, destWidth // Loop counter
  841. LastVS_1_2_loop:
  842. movq mm0, [esi] // get Src[0]
  843. movq [esi+ecx], mm0 // write out eight bytes
  844. add esi, 8
  845. sub edx, 8
  846. jg LastVS_1_2_loop
  847. }
  848. }
  849. /****************************************************************************
  850. *
  851. * ROUTINE : HorizontalLine_1_2_Scale
  852. *
  853. * INPUTS : const unsigned char *source :
  854. * unsigned int sourceWidth :
  855. * unsigned char *dest :
  856. * unsigned int destWidth :
  857. *
  858. * OUTPUTS : None.
  859. *
  860. * RETURNS : void
  861. *
  862. * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
  863. *
  864. * SPECIAL NOTES : None.
  865. *
  866. ****************************************************************************/
  867. void HorizontalLine_1_2_Scale_MMX
  868. (
  869. const unsigned char *source,
  870. unsigned int sourceWidth,
  871. unsigned char *dest,
  872. unsigned int destWidth
  873. )
  874. {
  875. (void) destWidth;
  876. __asm
  877. {
  878. mov esi, source
  879. mov edi, dest
  880. pxor mm7, mm7
  881. movq mm6, fourOnes
  882. mov ecx, sourceWidth
  883. HS_1_2_Loop:
  884. movq mm0, [esi]
  885. movq mm1, [esi+1]
  886. movq mm2, mm0
  887. movq mm3, mm1
  888. movq mm4, mm0
  889. punpcklbw mm0, mm7
  890. punpcklbw mm1, mm7
  891. paddw mm0, mm1
  892. paddw mm0, mm6
  893. punpckhbw mm2, mm7
  894. punpckhbw mm3, mm7
  895. paddw mm2, mm3
  896. paddw mm2, mm6
  897. psraw mm0, 1
  898. psraw mm2, 1
  899. packuswb mm0, mm2
  900. movq mm2, mm4
  901. punpcklbw mm2, mm0
  902. movq [edi], mm2
  903. punpckhbw mm4, mm0
  904. movq [edi+8], mm4
  905. add esi, 8
  906. add edi, 16
  907. sub ecx, 8
  908. cmp ecx, 8
  909. jg HS_1_2_Loop
  910. // last eight pixel
  911. movq mm0, [esi]
  912. movq mm1, mm0
  913. movq mm2, mm0
  914. movq mm3, mm1
  915. psrlq mm1, 8
  916. psrlq mm3, 56
  917. psllq mm3, 56
  918. por mm1, mm3
  919. movq mm3, mm1
  920. movq mm4, mm0
  921. punpcklbw mm0, mm7
  922. punpcklbw mm1, mm7
  923. paddw mm0, mm1
  924. paddw mm0, mm6
  925. punpckhbw mm2, mm7
  926. punpckhbw mm3, mm7
  927. paddw mm2, mm3
  928. paddw mm2, mm6
  929. psraw mm0, 1
  930. psraw mm2, 1
  931. packuswb mm0, mm2
  932. movq mm2, mm4
  933. punpcklbw mm2, mm0
  934. movq [edi], mm2
  935. punpckhbw mm4, mm0
  936. movq [edi+8], mm4
  937. }
  938. }
  939. #if defined(__cplusplus)
  940. extern "C" {
  941. #endif