deringopt.c 71 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529
  1. /****************************************************************************
  2. *
  3. * Module Title : DeRingingOpt.c
  4. *
  5. * Description : Optimized functions for PostProcessor
  6. *
  7. ***************************************************************************/
  8. #define STRICT /* Strict type checking */
  9. /****************************************************************************
  10. * Header Files
  11. ****************************************************************************/
  12. #include <stdio.h>
  13. #include <stdlib.h>
  14. #include "postp.h"
  15. /****************************************************************************
  16. * Macros
  17. ****************************************************************************/
  18. #pragma warning(disable:4799) /* C4799: function uses MMX but has no EMMS instruction */
  19. #pragma warning(disable:4731) /* C4731: frame pointer register (ebp) modified by inline assembly */
  20. #pragma warning(disable:4305) /* C4305: truncation from one type to a smaller type */
  21. /****************************************************************************
  22. * Module constants.
  23. ****************************************************************************/
  24. #if defined(_WIN32_WCE)
  25. #pragma pack(16)
  26. static unsigned short FourOnes[] = { 1, 1, 1, 1 };
  27. static unsigned short Four128s[] = { 128, 128, 128, 128 };
  28. static unsigned short Four64s[] = { 64, 64, 64, 64};
  29. static char eight64s [] = { 64, 64, 64, 64, 64, 64, 64, 64 };
  30. static char eight32s [] = { 32, 32, 32, 32, 32, 32, 32, 32 };
  31. static char eight127s []= { 127, 127, 127, 127, 127, 127, 127, 127 };
  32. static char eight128s []= { 128, 128, 128, 128, 128, 128, 128, 128 };
  33. static unsigned char eight223s[] = { 223, 223, 223, 223, 223, 223, 223, 223 };
  34. static unsigned char eight231s[] = { 231, 231, 231, 231, 231, 231, 231, 231 };
  35. #pragma pack()
  36. #else
  37. __declspec(align(16)) static unsigned short FourOnes[] = { 1, 1, 1, 1 };
  38. __declspec(align(16)) static unsigned short Four128s[] = { 128, 128, 128, 128 };
  39. __declspec(align(16)) static unsigned short Four64s[] = { 64, 64, 64, 64};
  40. __declspec(align(16)) static char eight64s [] = { 64, 64, 64, 64, 64, 64, 64, 64 };
  41. __declspec(align(16)) static char eight32s [] = { 32, 32, 32, 32, 32, 32, 32, 32 };
  42. __declspec(align(16)) static char eight127s []= { 127, 127, 127, 127, 127, 127, 127, 127 };
  43. __declspec(align(16)) static char eight128s []= { 128, 128, 128, 128, 128, 128, 128, 128 };
  44. __declspec(align(16)) static unsigned char eight223s[] = { 223, 223, 223, 223, 223, 223, 223, 223 };
  45. __declspec(align(16)) static unsigned char eight231s[] = { 231, 231, 231, 231, 231, 231, 231, 231 };
  46. #endif
  47. /****************************************************************************
  48. * Imports
  49. ****************************************************************************/
  50. extern UINT32 SharpenModifier[];
  51. /****************************************************************************
  52. *
  53. * ROUTINE : DeringBlockStrong_MMX
  54. *
  55. * INPUTS : const POSTPROC_INSTANCE *pbi : Pointer to post-processor instance (currently unused).
  56. * const UINT8 *SrcPtr : Pointer to input image.
  57. * UINT8 *DstPtr : Pointer to output image.
  58. * const INT32 Pitch : Image stride.
  59. * UINT32 FragQIndex : Q-index block encoded with.
  60. * UINT32 *QuantScale : Array of quantization scale factors.
  61. *
  62. * OUTPUTS : None.
  63. *
  64. * RETURNS : void
  65. *
  66. * FUNCTION : Filters a block for de-ringing purposes.
  67. *
  68. * SPECIAL NOTES : None.
  69. *
  70. ****************************************************************************/
  71. void DeringBlockStrong_MMX
  72. (
  73. const POSTPROC_INSTANCE *pbi,
  74. const UINT8 *SrcPtr,
  75. UINT8 *DstPtr,
  76. const INT32 Pitch,
  77. UINT32 FragQIndex,
  78. UINT32 *QuantScale
  79. )
  80. {
  81. #if defined(_WIN32_WCE)
  82. #pragma pack(16)
  83. short UDMod[72];
  84. short LRMod[128];
  85. #pragma pack()
  86. #else
  87. __declspec(align(16)) short UDMod[72];
  88. __declspec(align(16)) short LRMod[128];
  89. #endif
  90. unsigned int PlaneLineStep = Pitch;
  91. const unsigned char *Src = SrcPtr;
  92. unsigned char *Des = DstPtr;
  93. short *UDPointer = UDMod;
  94. short *LRPointer = LRMod;
  95. UINT32 QStep = QuantScale[FragQIndex];
  96. INT32 Sharpen = SharpenModifier[FragQIndex];
  97. (void) pbi;
  98. __asm
  99. {
  100. push esi
  101. push edi
  102. mov esi, Src /* Source Pointer */
  103. mov edi, UDPointer /* UD modifier pointer */
  104. push ecx
  105. push edx
  106. mov ecx, PlaneLineStep /* Pitch Step */
  107. xor edx, edx
  108. push eax
  109. push ebx
  110. mov eax, QStep /* QValue */
  111. mov ebx, Sharpen /* Sharpen */
  112. movd mm0, eax /* QValue */
  113. movd mm2, ebx /* sharpen */
  114. punpcklbw mm0, mm0 /* 00 00 00 QQ */
  115. sub edx, ecx /* Negative Pitch */
  116. punpcklbw mm2, mm2 /* 00 00 00 SS */
  117. pxor mm7, mm7 /* clear mm7 for unpacks */
  118. punpcklbw mm0, mm0 /* 00 00 qq qq */
  119. mov eax, LRPointer /* Left and Right Modifier */
  120. punpcklbw mm2, mm2 /* 00 00 ss ss */
  121. lea ebx, [esi+ecx*8] /* Source Pointer of last row */
  122. punpcklbw mm0, mm0 /* qq qq qq qq */
  123. movq mm1, mm0; /* make a copy */
  124. punpcklbw mm2, mm2 /* ss ss ss ss */
  125. paddb mm1, mm0 /* QValue * 2 */
  126. paddb mm1, mm0 /* High = 3 * Qvalue */
  127. paddusb mm1, eight223s /* clamping high to 32 */
  128. paddb mm0, eight32s /* 32+QValues */
  129. psubusb mm1, eight223s /* Get the real value back */
  130. movq mm3, eight127s /* 7f 7f 7f 7f 7f 7f 7f 7f */
  131. pandn mm1, mm3 /* ClampHigh */
  132. /* mm0,mm1,mm2,mm7 are in use */
  133. /* mm0---> QValue+32 */
  134. /* mm1---> ClampHigh */
  135. /* mm2---> Sharpen */
  136. /* mm7---> Cleared for unpack */
  137. FillModLoop1:
  138. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  139. movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
  140. movq mm5, mm3 /* make a copy of p */
  141. psubusb mm3, mm4 /* p-pu */
  142. psubusb mm4, mm5 /* pu-p */
  143. por mm3, mm4 /* abs(p-pu) */
  144. movq mm6, mm0 /* 32+QValues */
  145. movq mm4, mm0 /* 32+QValues */
  146. psubusb mm6, mm3 /* zero clampled TmpMod */
  147. movq mm5, eight128s /* 80 80 80 80 80 80 80 80 */
  148. paddb mm4, eight64s /* 32+QValues + 64 */
  149. pxor mm4, mm5 /* convert to a sign number */
  150. pxor mm3, mm5 /* convert to a sign number */
  151. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  152. pand mm3, mm2 /* use sharpen */
  153. paddsb mm6, mm1 /* clamping to high */
  154. psubsb mm6, mm1 /* offset back */
  155. por mm6, mm3 /* Mod value to be stored */
  156. pxor mm5, mm5 /* clear mm5 */
  157. pxor mm4, mm4 /* clear mm4 */
  158. punpcklbw mm5, mm6 /* 03 xx 02 xx 01 xx 00 xx */
  159. psraw mm5, 8 /* sign extended */
  160. movq QWORD PTR [edi], mm5 /* writeout UDmod, low four */
  161. punpckhbw mm4, mm6
  162. psraw mm4, 8
  163. movq QWORD PTR [edi+8], mm4 /* writeout UDmod, high four */
  164. /* left Mod */
  165. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  166. movq mm4, QWORD PTR [esi-1] /* Pixels on top pu */
  167. movq mm5, mm3 /* make a copy of p */
  168. psubusb mm3, mm4 /* p-pu */
  169. psubusb mm4, mm5 /* pu-p */
  170. por mm3, mm4 /* abs(p-pu) */
  171. movq mm6, mm0 /* 32+QValues */
  172. movq mm4, mm0 /* 32+QValues */
  173. psubusb mm6, mm3 /* zero clampled TmpMod */
  174. movq mm5, eight128s /* 80 80 80 80 80 80 80 80 */
  175. paddb mm4, eight64s /* 32+QValues + 64 */
  176. pxor mm4, mm5 /* convert to a sign number */
  177. pxor mm3, mm5 /* convert to a sign number */
  178. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  179. pand mm3, mm2 /* use sharpen */
  180. paddsb mm6, mm1 /* clamping to high */
  181. psubsb mm6, mm1 /* offset back */
  182. por mm6, mm3 /* Mod value to be stored */
  183. pxor mm5, mm5 /* clear mm5 */
  184. pxor mm4, mm4 /* clear mm4 */
  185. punpcklbw mm5, mm6 /* 03 xx 02 xx 01 xx 00 xx */
  186. psraw mm5, 8 /* sign extended */
  187. movq QWORD PTR [eax], mm5 /* writeout UDmod, low four */
  188. punpckhbw mm4, mm6
  189. psraw mm4, 8
  190. movq QWORD PTR [eax+8], mm4 /* writeout UDmod, high four */
  191. /* Right Mod */
  192. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  193. movq mm4, QWORD PTR [esi+1] /* Pixels on top pu */
  194. movq mm5, mm3 /* make a copy of p */
  195. psubusb mm3, mm4 /* p-pu */
  196. psubusb mm4, mm5 /* pu-p */
  197. por mm3, mm4 /* abs(p-pu) */
  198. movq mm6, mm0 /* 32+QValues */
  199. movq mm4, mm0 /* 32+QValues */
  200. psubusb mm6, mm3 /* zero clampled TmpMod */
  201. movq mm5, eight128s /* 80 80 80 80 80 80 80 80 */
  202. paddb mm4, eight64s /* 32+QValues + 64 */
  203. pxor mm4, mm5 /* convert to a sign number */
  204. pxor mm3, mm5 /* convert to a sign number */
  205. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  206. pand mm3, mm2 /* use sharpen */
  207. paddsb mm6, mm1 /* clamping to high */
  208. psubsb mm6, mm1 /* offset back */
  209. por mm6, mm3 /* Mod value to be stored */
  210. pxor mm5, mm5 /* clear mm5 */
  211. pxor mm4, mm4 /* clear mm4 */
  212. punpcklbw mm5, mm6 /* 03 xx 02 xx 01 xx 00 xx */
  213. psraw mm5, 8 /* sign extended */
  214. movq QWORD PTR [eax+128], mm5 /* writeout UDmod, low four */
  215. punpckhbw mm4, mm6
  216. psraw mm4, 8
  217. movq QWORD PTR [eax+136], mm4 /* writeout UDmod, high four */
  218. add esi, ecx
  219. add edi, 16
  220. add eax, 16
  221. cmp esi, ebx
  222. jne FillModLoop1
  223. /* last UDMod */
  224. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  225. movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
  226. movq mm5, mm3 /* make a copy of p */
  227. psubusb mm3, mm4 /* p-pu */
  228. psubusb mm4, mm5 /* pu-p */
  229. por mm3, mm4 /* abs(p-pu) */
  230. movq mm6, mm0 /* 32+QValues */
  231. movq mm4, mm0 /* 32+QValues */
  232. psubusb mm6, mm3 /* zero clampled TmpMod */
  233. movq mm5, eight128s /* 80 80 80 80 80 80 80 80 */
  234. paddb mm4, eight64s /* 32+QValues + 64 */
  235. pxor mm4, mm5 /* convert to a sign number */
  236. pxor mm3, mm5 /* convert to a sign number */
  237. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  238. pand mm3, mm2 /* use sharpen */
  239. paddsb mm6, mm1 /* clamping to high */
  240. psubsb mm6, mm1 /* offset back */
  241. por mm6, mm3 /* Mod value to be stored */
  242. pxor mm5, mm5 /* clear mm5 */
  243. pxor mm4, mm4 /* clear mm4 */
  244. punpcklbw mm5, mm6 /* 03 xx 02 xx 01 xx 00 xx */
  245. psraw mm5, 8 /* sign extended */
  246. movq QWORD PTR [edi], mm5 /* writeout UDmod, low four */
  247. punpckhbw mm4, mm6
  248. psraw mm4, 8
  249. movq QWORD PTR [edi+8], mm4 /* writeout UDmod, high four */
  250. mov esi, Src
  251. mov edi, Des
  252. mov eax, UDPointer
  253. mov ebx, LRPointer
  254. /* First Row */
  255. movq mm0, [esi+edx] /* mm0 = Pixels above */
  256. pxor mm7, mm7 /* clear mm7 */
  257. movq mm1, mm0 /* make a copy of mm0 */
  258. punpcklbw mm0, mm7 /* lower four pixels */
  259. movq mm4, [eax] /* au */
  260. punpckhbw mm1, mm7 /* high four pixels */
  261. movq mm5, [eax+8] /* au */
  262. pmullw mm0, mm4 /* pu*au */
  263. movq mm2, [esi+ecx] /* mm2 = pixels below */
  264. pmullw mm1, mm5 /* pu*au */
  265. movq mm3, mm2 /* make a copy of mm2 */
  266. punpcklbw mm2, mm7 /* lower four */
  267. movq mm6, [eax+16] /* ad */
  268. punpckhbw mm3, mm7 /* higher four */
  269. paddw mm4, mm6 /* au+ad */
  270. pmullw mm2, mm6 /* au*pu+ad*pd */
  271. movq mm6, [eax+24] /* ad */
  272. paddw mm0, mm2
  273. paddw mm5, mm6 /* au+ad */
  274. pmullw mm3, mm6 /* ad*pd */
  275. movq mm2, [esi-1] /* pixel to the left */
  276. paddw mm1, mm3 /* au*pu+ad*pd */
  277. movq mm3, mm2 /* make a copy of mm2 */
  278. punpcklbw mm2, mm7 /* four left pixels */
  279. movq mm6, [ebx] /* al */
  280. punpckhbw mm3, mm7 /* four right pixels */
  281. paddw mm4, mm6 /* au + ad + al */
  282. pmullw mm2, mm6 /* pl * al */
  283. movq mm6, [ebx+8] /* al */
  284. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  285. paddw mm5, mm6 /* au+ad+al */
  286. pmullw mm3, mm6 /* al*pl */
  287. movq mm2, [esi+1] /* pixel to the right */
  288. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  289. movq mm3, mm2 /* make a copy of mm2 */
  290. punpcklbw mm2, mm7 /* four left pixels */
  291. movq mm6, [ebx+128] /* ar */
  292. punpckhbw mm3, mm7 /* four right pixels */
  293. paddw mm4, mm6 /* au + ad + al + ar */
  294. pmullw mm2, mm6 /* pr * ar */
  295. movq mm6, [ebx+136] /* ar */
  296. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  297. paddw mm5, mm6 /* au+ad+al+ar */
  298. pmullw mm3, mm6 /* ar*pr */
  299. movq mm2, [esi] /* p */
  300. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  301. movq mm3, mm2 /* make a copy of the pixel */
  302. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  303. /* mm4, mm5 --- au + ad + al + ar */
  304. punpcklbw mm2, mm7 /* left four pixels */
  305. movq mm6, Four128s /* 0080 0080 0080 0080 */
  306. punpckhbw mm3, mm7 /* right four pixels */
  307. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  308. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  309. movq mm6, Four128s /* 0080 0080 0080 0080 */
  310. paddw mm0, mm2 /* sum */
  311. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  312. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  313. movq mm6, Four64s /* {64, 64, 64, 64 } */
  314. movq mm7, mm6 /* {64, 64, 64, 64} */
  315. paddw mm0, mm6 /* sum+B */
  316. paddw mm1, mm3 /* sum */
  317. psllw mm7, 8 /* {16384, .. } */
  318. paddw mm0, mm7 /* clamping */
  319. paddw mm1, mm6 /* sum+B */
  320. paddw mm1, mm7 /* clamping */
  321. psubusw mm0, mm7 /* clamping */
  322. psubusw mm1, mm7 /* clamping */
  323. psrlw mm0, 7 /* (sum+B)>>7 */
  324. psrlw mm1, 7 /* (sum+B)>>7 */
  325. packuswb mm0, mm1 /* pack to 8 bytes */
  326. movq [edi], mm0 /* write to destination */
  327. add esi, ecx /* Src += Pitch */
  328. add edi, ecx /* Des += Pitch */
  329. add eax, 16 /* UDPointer += 8 */
  330. add ebx, 16 /* LPointer +=8 */
  331. /* Second Row */
  332. movq mm0, [esi+edx] /* mm0 = Pixels above */
  333. pxor mm7, mm7 /* clear mm7 */
  334. movq mm1, mm0 /* make a copy of mm0 */
  335. punpcklbw mm0, mm7 /* lower four pixels */
  336. movq mm4, [eax] /* au */
  337. punpckhbw mm1, mm7 /* high four pixels */
  338. movq mm5, [eax+8] /* au */
  339. pmullw mm0, mm4 /* pu*au */
  340. movq mm2, [esi+ecx] /* mm2 = pixels below */
  341. pmullw mm1, mm5 /* pu*au */
  342. movq mm3, mm2 /* make a copy of mm2 */
  343. punpcklbw mm2, mm7 /* lower four */
  344. movq mm6, [eax+16] /* ad */
  345. punpckhbw mm3, mm7 /* higher four */
  346. paddw mm4, mm6 /* au+ad */
  347. pmullw mm2, mm6 /* au*pu+ad*pd */
  348. movq mm6, [eax+24] /* ad */
  349. paddw mm0, mm2
  350. paddw mm5, mm6 /* au+ad */
  351. pmullw mm3, mm6 /* ad*pd */
  352. movq mm2, [esi-1] /* pixel to the left */
  353. paddw mm1, mm3 /* au*pu+ad*pd */
  354. movq mm3, mm2 /* make a copy of mm2 */
  355. punpcklbw mm2, mm7 /* four left pixels */
  356. movq mm6, [ebx] /* al */
  357. punpckhbw mm3, mm7 /* four right pixels */
  358. paddw mm4, mm6 /* au + ad + al */
  359. pmullw mm2, mm6 /* pl * al */
  360. movq mm6, [ebx+8] /* al */
  361. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  362. paddw mm5, mm6 /* au+ad+al */
  363. pmullw mm3, mm6 /* al*pl */
  364. movq mm2, [esi+1] /* pixel to the right */
  365. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  366. movq mm3, mm2 /* make a copy of mm2 */
  367. punpcklbw mm2, mm7 /* four left pixels */
  368. movq mm6, [ebx+128] /* ar */
  369. punpckhbw mm3, mm7 /* four right pixels */
  370. paddw mm4, mm6 /* au + ad + al + ar */
  371. pmullw mm2, mm6 /* pr * ar */
  372. movq mm6, [ebx+136] /* ar */
  373. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  374. paddw mm5, mm6 /* au+ad+al+ar */
  375. pmullw mm3, mm6 /* ar*pr */
  376. movq mm2, [esi] /* p */
  377. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  378. movq mm3, mm2 /* make a copy of the pixel */
  379. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  380. /* mm4, mm5 --- au + ad + al + ar */
  381. punpcklbw mm2, mm7 /* left four pixels */
  382. movq mm6, Four128s /* 0080 0080 0080 0080 */
  383. punpckhbw mm3, mm7 /* right four pixels */
  384. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  385. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  386. movq mm6, Four128s /* 0080 0080 0080 0080 */
  387. paddw mm0, mm2 /* sum */
  388. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  389. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  390. movq mm6, Four64s /* {64, 64, 64, 64 } */
  391. movq mm7, mm6 /* {64, 64, 64, 64} */
  392. paddw mm0, mm6 /* sum+B */
  393. paddw mm1, mm3 /* sum */
  394. psllw mm7, 8 /* {16384, .. } */
  395. paddw mm0, mm7 /* clamping */
  396. paddw mm1, mm6 /* sum+B */
  397. paddw mm1, mm7 /* clamping */
  398. psubusw mm0, mm7 /* clamping */
  399. psubusw mm1, mm7 /* clamping */
  400. psrlw mm0, 7 /* (sum+B)>>7 */
  401. psrlw mm1, 7 /* (sum+B)>>7 */
  402. packuswb mm0, mm1 /* pack to 8 bytes */
  403. movq [edi], mm0 /* write to destination */
  404. add esi, ecx /* Src += Pitch */
  405. add edi, ecx /* Des += Pitch */
  406. add eax, 16 /* UDPointer += 8 */
  407. add ebx, 16 /* LPointer +=8 */
  408. /* Third Row */
  409. movq mm0, [esi+edx] /* mm0 = Pixels above */
  410. pxor mm7, mm7 /* clear mm7 */
  411. movq mm1, mm0 /* make a copy of mm0 */
  412. punpcklbw mm0, mm7 /* lower four pixels */
  413. movq mm4, [eax] /* au */
  414. punpckhbw mm1, mm7 /* high four pixels */
  415. movq mm5, [eax+8] /* au */
  416. pmullw mm0, mm4 /* pu*au */
  417. movq mm2, [esi+ecx] /* mm2 = pixels below */
  418. pmullw mm1, mm5 /* pu*au */
  419. movq mm3, mm2 /* make a copy of mm2 */
  420. punpcklbw mm2, mm7 /* lower four */
  421. movq mm6, [eax+16] /* ad */
  422. punpckhbw mm3, mm7 /* higher four */
  423. paddw mm4, mm6 /* au+ad */
  424. pmullw mm2, mm6 /* au*pu+ad*pd */
  425. movq mm6, [eax+24] /* ad */
  426. paddw mm0, mm2
  427. paddw mm5, mm6 /* au+ad */
  428. pmullw mm3, mm6 /* ad*pd */
  429. movq mm2, [esi-1] /* pixel to the left */
  430. paddw mm1, mm3 /* au*pu+ad*pd */
  431. movq mm3, mm2 /* make a copy of mm2 */
  432. punpcklbw mm2, mm7 /* four left pixels */
  433. movq mm6, [ebx] /* al */
  434. punpckhbw mm3, mm7 /* four right pixels */
  435. paddw mm4, mm6 /* au + ad + al */
  436. pmullw mm2, mm6 /* pl * al */
  437. movq mm6, [ebx+8] /* al */
  438. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  439. paddw mm5, mm6 /* au+ad+al */
  440. pmullw mm3, mm6 /* al*pl */
  441. movq mm2, [esi+1] /* pixel to the right */
  442. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  443. movq mm3, mm2 /* make a copy of mm2 */
  444. punpcklbw mm2, mm7 /* four left pixels */
  445. movq mm6, [ebx+128] /* ar */
  446. punpckhbw mm3, mm7 /* four right pixels */
  447. paddw mm4, mm6 /* au + ad + al + ar */
  448. pmullw mm2, mm6 /* pr * ar */
  449. movq mm6, [ebx+136] /* ar */
  450. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  451. paddw mm5, mm6 /* au+ad+al+ar */
  452. pmullw mm3, mm6 /* ar*pr */
  453. movq mm2, [esi] /* p */
  454. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  455. movq mm3, mm2 /* make a copy of the pixel */
  456. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  457. /* mm4, mm5 --- au + ad + al + ar */
  458. punpcklbw mm2, mm7 /* left four pixels */
  459. movq mm6, Four128s /* 0080 0080 0080 0080 */
  460. punpckhbw mm3, mm7 /* right four pixels */
  461. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  462. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  463. movq mm6, Four128s /* 0080 0080 0080 0080 */
  464. paddw mm0, mm2 /* sum */
  465. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  466. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  467. movq mm6, Four64s /* {64, 64, 64, 64 } */
  468. movq mm7, mm6 /* {64, 64, 64, 64} */
  469. paddw mm0, mm6 /* sum+B */
  470. paddw mm1, mm3 /* sum */
  471. psllw mm7, 8 /* {16384, .. } */
  472. paddw mm0, mm7 /* clamping */
  473. paddw mm1, mm6 /* sum+B */
  474. paddw mm1, mm7 /* clamping */
  475. psubusw mm0, mm7 /* clamping */
  476. psubusw mm1, mm7 /* clamping */
  477. psrlw mm0, 7 /* (sum+B)>>7 */
  478. psrlw mm1, 7 /* (sum+B)>>7 */
  479. packuswb mm0, mm1 /* pack to 8 bytes */
  480. movq [edi], mm0 /* write to destination */
  481. add esi, ecx /* Src += Pitch */
  482. add edi, ecx /* Des += Pitch */
  483. add eax, 16 /* UDPointer += 8 */
  484. add ebx, 16 /* LPointer +=8 */
  485. /* Fourth Row */
  486. movq mm0, [esi+edx] /* mm0 = Pixels above */
  487. pxor mm7, mm7 /* clear mm7 */
  488. movq mm1, mm0 /* make a copy of mm0 */
  489. punpcklbw mm0, mm7 /* lower four pixels */
  490. movq mm4, [eax] /* au */
  491. punpckhbw mm1, mm7 /* high four pixels */
  492. movq mm5, [eax+8] /* au */
  493. pmullw mm0, mm4 /* pu*au */
  494. movq mm2, [esi+ecx] /* mm2 = pixels below */
  495. pmullw mm1, mm5 /* pu*au */
  496. movq mm3, mm2 /* make a copy of mm2 */
  497. punpcklbw mm2, mm7 /* lower four */
  498. movq mm6, [eax+16] /* ad */
  499. punpckhbw mm3, mm7 /* higher four */
  500. paddw mm4, mm6 /* au+ad */
  501. pmullw mm2, mm6 /* au*pu+ad*pd */
  502. movq mm6, [eax+24] /* ad */
  503. paddw mm0, mm2
  504. paddw mm5, mm6 /* au+ad */
  505. pmullw mm3, mm6 /* ad*pd */
  506. movq mm2, [esi-1] /* pixel to the left */
  507. paddw mm1, mm3 /* au*pu+ad*pd */
  508. movq mm3, mm2 /* make a copy of mm2 */
  509. punpcklbw mm2, mm7 /* four left pixels */
  510. movq mm6, [ebx] /* al */
  511. punpckhbw mm3, mm7 /* four right pixels */
  512. paddw mm4, mm6 /* au + ad + al */
  513. pmullw mm2, mm6 /* pl * al */
  514. movq mm6, [ebx+8] /* al */
  515. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  516. paddw mm5, mm6 /* au+ad+al */
  517. pmullw mm3, mm6 /* al*pl */
  518. movq mm2, [esi+1] /* pixel to the right */
  519. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  520. movq mm3, mm2 /* make a copy of mm2 */
  521. punpcklbw mm2, mm7 /* four left pixels */
  522. movq mm6, [ebx+128] /* ar */
  523. punpckhbw mm3, mm7 /* four right pixels */
  524. paddw mm4, mm6 /* au + ad + al + ar */
  525. pmullw mm2, mm6 /* pr * ar */
  526. movq mm6, [ebx+136] /* ar */
  527. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  528. paddw mm5, mm6 /* au+ad+al+ar */
  529. pmullw mm3, mm6 /* ar*pr */
  530. movq mm2, [esi] /* p */
  531. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  532. movq mm3, mm2 /* make a copy of the pixel */
  533. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  534. /* mm4, mm5 --- au + ad + al + ar */
  535. punpcklbw mm2, mm7 /* left four pixels */
  536. movq mm6, Four128s /* 0080 0080 0080 0080 */
  537. punpckhbw mm3, mm7 /* right four pixels */
  538. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  539. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  540. movq mm6, Four128s /* 0080 0080 0080 0080 */
  541. paddw mm0, mm2 /* sum */
  542. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  543. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  544. movq mm6, Four64s /* {64, 64, 64, 64 } */
  545. movq mm7, mm6 /* {64, 64, 64, 64} */
  546. paddw mm0, mm6 /* sum+B */
  547. paddw mm1, mm3 /* sum */
  548. psllw mm7, 8 /* {16384, .. } */
  549. paddw mm0, mm7 /* clamping */
  550. paddw mm1, mm6 /* sum+B */
  551. paddw mm1, mm7 /* clamping */
  552. psubusw mm0, mm7 /* clamping */
  553. psubusw mm1, mm7 /* clamping */
  554. psrlw mm0, 7 /* (sum+B)>>7 */
  555. psrlw mm1, 7 /* (sum+B)>>7 */
  556. packuswb mm0, mm1 /* pack to 8 bytes */
  557. movq [edi], mm0 /* write to destination */
  558. add esi, ecx /* Src += Pitch */
  559. add edi, ecx /* Des += Pitch */
  560. add eax, 16 /* UDPointer += 8 */
  561. add ebx, 16 /* LPointer +=8 */
  562. /* Fifth Row */
  563. movq mm0, [esi+edx] /* mm0 = Pixels above */
  564. pxor mm7, mm7 /* clear mm7 */
  565. movq mm1, mm0 /* make a copy of mm0 */
  566. punpcklbw mm0, mm7 /* lower four pixels */
  567. movq mm4, [eax] /* au */
  568. punpckhbw mm1, mm7 /* high four pixels */
  569. movq mm5, [eax+8] /* au */
  570. pmullw mm0, mm4 /* pu*au */
  571. movq mm2, [esi+ecx] /* mm2 = pixels below */
  572. pmullw mm1, mm5 /* pu*au */
  573. movq mm3, mm2 /* make a copy of mm2 */
  574. punpcklbw mm2, mm7 /* lower four */
  575. movq mm6, [eax+16] /* ad */
  576. punpckhbw mm3, mm7 /* higher four */
  577. paddw mm4, mm6 /* au+ad */
  578. pmullw mm2, mm6 /* au*pu+ad*pd */
  579. movq mm6, [eax+24] /* ad */
  580. paddw mm0, mm2
  581. paddw mm5, mm6 /* au+ad */
  582. pmullw mm3, mm6 /* ad*pd */
  583. movq mm2, [esi-1] /* pixel to the left */
  584. paddw mm1, mm3 /* au*pu+ad*pd */
  585. movq mm3, mm2 /* make a copy of mm2 */
  586. punpcklbw mm2, mm7 /* four left pixels */
  587. movq mm6, [ebx] /* al */
  588. punpckhbw mm3, mm7 /* four right pixels */
  589. paddw mm4, mm6 /* au + ad + al */
  590. pmullw mm2, mm6 /* pl * al */
  591. movq mm6, [ebx+8] /* al */
  592. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  593. paddw mm5, mm6 /* au+ad+al */
  594. pmullw mm3, mm6 /* al*pl */
  595. movq mm2, [esi+1] /* pixel to the right */
  596. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  597. movq mm3, mm2 /* make a copy of mm2 */
  598. punpcklbw mm2, mm7 /* four left pixels */
  599. movq mm6, [ebx+128] /* ar */
  600. punpckhbw mm3, mm7 /* four right pixels */
  601. paddw mm4, mm6 /* au + ad + al + ar */
  602. pmullw mm2, mm6 /* pr * ar */
  603. movq mm6, [ebx+136] /* ar */
  604. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  605. paddw mm5, mm6 /* au+ad+al+ar */
  606. pmullw mm3, mm6 /* ar*pr */
  607. movq mm2, [esi] /* p */
  608. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  609. movq mm3, mm2 /* make a copy of the pixel */
  610. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  611. /* mm4, mm5 --- au + ad + al + ar */
  612. punpcklbw mm2, mm7 /* left four pixels */
  613. movq mm6, Four128s /* 0080 0080 0080 0080 */
  614. punpckhbw mm3, mm7 /* right four pixels */
  615. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  616. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  617. movq mm6, Four128s /* 0080 0080 0080 0080 */
  618. paddw mm0, mm2 /* sum */
  619. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  620. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  621. movq mm6, Four64s /* {64, 64, 64, 64 } */
  622. movq mm7, mm6 /* {64, 64, 64, 64} */
  623. paddw mm0, mm6 /* sum+B */
  624. paddw mm1, mm3 /* sum */
  625. psllw mm7, 8 /* {16384, .. } */
  626. paddw mm0, mm7 /* clamping */
  627. paddw mm1, mm6 /* sum+B */
  628. paddw mm1, mm7 /* clamping */
  629. psubusw mm0, mm7 /* clamping */
  630. psubusw mm1, mm7 /* clamping */
  631. psrlw mm0, 7 /* (sum+B)>>7 */
  632. psrlw mm1, 7 /* (sum+B)>>7 */
  633. packuswb mm0, mm1 /* pack to 8 bytes */
  634. movq [edi], mm0 /* write to destination */
  635. add esi, ecx /* Src += Pitch */
  636. add edi, ecx /* Des += Pitch */
  637. add eax, 16 /* UDPointer += 8 */
  638. add ebx, 16 /* LPointer +=8 */
  639. /* Sixth Row */
  640. movq mm0, [esi+edx] /* mm0 = Pixels above */
  641. pxor mm7, mm7 /* clear mm7 */
  642. movq mm1, mm0 /* make a copy of mm0 */
  643. punpcklbw mm0, mm7 /* lower four pixels */
  644. movq mm4, [eax] /* au */
  645. punpckhbw mm1, mm7 /* high four pixels */
  646. movq mm5, [eax+8] /* au */
  647. pmullw mm0, mm4 /* pu*au */
  648. movq mm2, [esi+ecx] /* mm2 = pixels below */
  649. pmullw mm1, mm5 /* pu*au */
  650. movq mm3, mm2 /* make a copy of mm2 */
  651. punpcklbw mm2, mm7 /* lower four */
  652. movq mm6, [eax+16] /* ad */
  653. punpckhbw mm3, mm7 /* higher four */
  654. paddw mm4, mm6 /* au+ad */
  655. pmullw mm2, mm6 /* au*pu+ad*pd */
  656. movq mm6, [eax+24] /* ad */
  657. paddw mm0, mm2
  658. paddw mm5, mm6 /* au+ad */
  659. pmullw mm3, mm6 /* ad*pd */
  660. movq mm2, [esi-1] /* pixel to the left */
  661. paddw mm1, mm3 /* au*pu+ad*pd */
  662. movq mm3, mm2 /* make a copy of mm2 */
  663. punpcklbw mm2, mm7 /* four left pixels */
  664. movq mm6, [ebx] /* al */
  665. punpckhbw mm3, mm7 /* four right pixels */
  666. paddw mm4, mm6 /* au + ad + al */
  667. pmullw mm2, mm6 /* pl * al */
  668. movq mm6, [ebx+8] /* al */
  669. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  670. paddw mm5, mm6 /* au+ad+al */
  671. pmullw mm3, mm6 /* al*pl */
  672. movq mm2, [esi+1] /* pixel to the right */
  673. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  674. movq mm3, mm2 /* make a copy of mm2 */
  675. punpcklbw mm2, mm7 /* four left pixels */
  676. movq mm6, [ebx+128] /* ar */
  677. punpckhbw mm3, mm7 /* four right pixels */
  678. paddw mm4, mm6 /* au + ad + al + ar */
  679. pmullw mm2, mm6 /* pr * ar */
  680. movq mm6, [ebx+136] /* ar */
  681. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  682. paddw mm5, mm6 /* au+ad+al+ar */
  683. pmullw mm3, mm6 /* ar*pr */
  684. movq mm2, [esi] /* p */
  685. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  686. movq mm3, mm2 /* make a copy of the pixel */
  687. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  688. /* mm4, mm5 --- au + ad + al + ar */
  689. punpcklbw mm2, mm7 /* left four pixels */
  690. movq mm6, Four128s /* 0080 0080 0080 0080 */
  691. punpckhbw mm3, mm7 /* right four pixels */
  692. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  693. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  694. movq mm6, Four128s /* 0080 0080 0080 0080 */
  695. paddw mm0, mm2 /* sum */
  696. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  697. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  698. movq mm6, Four64s /* {64, 64, 64, 64 } */
  699. movq mm7, mm6 /* {64, 64, 64, 64} */
  700. paddw mm0, mm6 /* sum+B */
  701. paddw mm1, mm3 /* sum */
  702. psllw mm7, 8 /* {16384, .. } */
  703. paddw mm0, mm7 /* clamping */
  704. paddw mm1, mm6 /* sum+B */
  705. paddw mm1, mm7 /* clamping */
  706. psubusw mm0, mm7 /* clamping */
  707. psubusw mm1, mm7 /* clamping */
  708. psrlw mm0, 7 /* (sum+B)>>7 */
  709. psrlw mm1, 7 /* (sum+B)>>7 */
  710. packuswb mm0, mm1 /* pack to 8 bytes */
  711. movq [edi], mm0 /* write to destination */
  712. add esi, ecx /* Src += Pitch */
  713. add edi, ecx /* Des += Pitch */
  714. add eax, 16 /* UDPointer += 8 */
  715. add ebx, 16 /* LPointer +=8 */
  716. /* Seventh Row */
  717. movq mm0, [esi+edx] /* mm0 = Pixels above */
  718. pxor mm7, mm7 /* clear mm7 */
  719. movq mm1, mm0 /* make a copy of mm0 */
  720. punpcklbw mm0, mm7 /* lower four pixels */
  721. movq mm4, [eax] /* au */
  722. punpckhbw mm1, mm7 /* high four pixels */
  723. movq mm5, [eax+8] /* au */
  724. pmullw mm0, mm4 /* pu*au */
  725. movq mm2, [esi+ecx] /* mm2 = pixels below */
  726. pmullw mm1, mm5 /* pu*au */
  727. movq mm3, mm2 /* make a copy of mm2 */
  728. punpcklbw mm2, mm7 /* lower four */
  729. movq mm6, [eax+16] /* ad */
  730. punpckhbw mm3, mm7 /* higher four */
  731. paddw mm4, mm6 /* au+ad */
  732. pmullw mm2, mm6 /* au*pu+ad*pd */
  733. movq mm6, [eax+24] /* ad */
  734. paddw mm0, mm2
  735. paddw mm5, mm6 /* au+ad */
  736. pmullw mm3, mm6 /* ad*pd */
  737. movq mm2, [esi-1] /* pixel to the left */
  738. paddw mm1, mm3 /* au*pu+ad*pd */
  739. movq mm3, mm2 /* make a copy of mm2 */
  740. punpcklbw mm2, mm7 /* four left pixels */
  741. movq mm6, [ebx] /* al */
  742. punpckhbw mm3, mm7 /* four right pixels */
  743. paddw mm4, mm6 /* au + ad + al */
  744. pmullw mm2, mm6 /* pl * al */
  745. movq mm6, [ebx+8] /* al */
  746. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  747. paddw mm5, mm6 /* au+ad+al */
  748. pmullw mm3, mm6 /* al*pl */
  749. movq mm2, [esi+1] /* pixel to the right */
  750. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  751. movq mm3, mm2 /* make a copy of mm2 */
  752. punpcklbw mm2, mm7 /* four left pixels */
  753. movq mm6, [ebx+128] /* ar */
  754. punpckhbw mm3, mm7 /* four right pixels */
  755. paddw mm4, mm6 /* au + ad + al + ar */
  756. pmullw mm2, mm6 /* pr * ar */
  757. movq mm6, [ebx+136] /* ar */
  758. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  759. paddw mm5, mm6 /* au+ad+al+ar */
  760. pmullw mm3, mm6 /* ar*pr */
  761. movq mm2, [esi] /* p */
  762. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  763. movq mm3, mm2 /* make a copy of the pixel */
  764. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  765. /* mm4, mm5 --- au + ad + al + ar */
  766. punpcklbw mm2, mm7 /* left four pixels */
  767. movq mm6, Four128s /* 0080 0080 0080 0080 */
  768. punpckhbw mm3, mm7 /* right four pixels */
  769. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  770. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  771. movq mm6, Four128s /* 0080 0080 0080 0080 */
  772. paddw mm0, mm2 /* sum */
  773. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  774. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  775. movq mm6, Four64s /* {64, 64, 64, 64 } */
  776. movq mm7, mm6 /* {64, 64, 64, 64} */
  777. paddw mm0, mm6 /* sum+B */
  778. paddw mm1, mm3 /* sum */
  779. psllw mm7, 8 /* {16384, .. } */
  780. paddw mm0, mm7 /* clamping */
  781. paddw mm1, mm6 /* sum+B */
  782. paddw mm1, mm7 /* clamping */
  783. psubusw mm0, mm7 /* clamping */
  784. psubusw mm1, mm7 /* clamping */
  785. psrlw mm0, 7 /* (sum+B)>>7 */
  786. psrlw mm1, 7 /* (sum+B)>>7 */
  787. packuswb mm0, mm1 /* pack to 8 bytes */
  788. movq [edi], mm0 /* write to destination */
  789. add esi, ecx /* Src += Pitch */
  790. add edi, ecx /* Des += Pitch */
  791. add eax, 16 /* UDPointer += 8 */
  792. add ebx, 16 /* LPointer +=8 */
  793. /* Eighth Row */
  794. movq mm0, [esi+edx] /* mm0 = Pixels above */
  795. pxor mm7, mm7 /* clear mm7 */
  796. movq mm1, mm0 /* make a copy of mm0 */
  797. punpcklbw mm0, mm7 /* lower four pixels */
  798. movq mm4, [eax] /* au */
  799. punpckhbw mm1, mm7 /* high four pixels */
  800. movq mm5, [eax+8] /* au */
  801. pmullw mm0, mm4 /* pu*au */
  802. movq mm2, [esi+ecx] /* mm2 = pixels below */
  803. pmullw mm1, mm5 /* pu*au */
  804. movq mm3, mm2 /* make a copy of mm2 */
  805. punpcklbw mm2, mm7 /* lower four */
  806. movq mm6, [eax+16] /* ad */
  807. punpckhbw mm3, mm7 /* higher four */
  808. paddw mm4, mm6 /* au+ad */
  809. pmullw mm2, mm6 /* au*pu+ad*pd */
  810. movq mm6, [eax+24] /* ad */
  811. paddw mm0, mm2
  812. paddw mm5, mm6 /* au+ad */
  813. pmullw mm3, mm6 /* ad*pd */
  814. movq mm2, [esi-1] /* pixel to the left */
  815. paddw mm1, mm3 /* au*pu+ad*pd */
  816. movq mm3, mm2 /* make a copy of mm2 */
  817. punpcklbw mm2, mm7 /* four left pixels */
  818. movq mm6, [ebx] /* al */
  819. punpckhbw mm3, mm7 /* four right pixels */
  820. paddw mm4, mm6 /* au + ad + al */
  821. pmullw mm2, mm6 /* pl * al */
  822. movq mm6, [ebx+8] /* al */
  823. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  824. paddw mm5, mm6 /* au+ad+al */
  825. pmullw mm3, mm6 /* al*pl */
  826. movq mm2, [esi+1] /* pixel to the right */
  827. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  828. movq mm3, mm2 /* make a copy of mm2 */
  829. punpcklbw mm2, mm7 /* four left pixels */
  830. movq mm6, [ebx+128] /* ar */
  831. punpckhbw mm3, mm7 /* four right pixels */
  832. paddw mm4, mm6 /* au + ad + al + ar */
  833. pmullw mm2, mm6 /* pr * ar */
  834. movq mm6, [ebx+136] /* ar */
  835. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  836. paddw mm5, mm6 /* au+ad+al+ar */
  837. pmullw mm3, mm6 /* ar*pr */
  838. movq mm2, [esi] /* p */
  839. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  840. movq mm3, mm2 /* make a copy of the pixel */
  841. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  842. /* mm4, mm5 --- au + ad + al + ar */
  843. punpcklbw mm2, mm7 /* left four pixels */
  844. movq mm6, Four128s /* 0080 0080 0080 0080 */
  845. punpckhbw mm3, mm7 /* right four pixels */
  846. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  847. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  848. movq mm6, Four128s /* 0080 0080 0080 0080 */
  849. paddw mm0, mm2 /* sum */
  850. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  851. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  852. movq mm6, Four64s /* {64, 64, 64, 64 } */
  853. movq mm7, mm6 /* {64, 64, 64, 64} */
  854. paddw mm0, mm6 /* sum+B */
  855. paddw mm1, mm3 /* sum */
  856. psllw mm7, 8 /* {16384, .. } */
  857. paddw mm0, mm7 /* clamping */
  858. paddw mm1, mm6 /* sum+B */
  859. paddw mm1, mm7 /* clamping */
  860. psubusw mm0, mm7 /* clamping */
  861. psubusw mm1, mm7 /* clamping */
  862. psrlw mm0, 7 /* (sum+B)>>7 */
  863. psrlw mm1, 7 /* (sum+B)>>7 */
  864. packuswb mm0, mm1 /* pack to 8 bytes */
  865. movq [edi], mm0 /* write to destination */
  866. pop ebx
  867. pop eax
  868. pop edx
  869. pop ecx
  870. pop edi
  871. pop esi
  872. }
  873. }
  874. /****************************************************************************
  875. *
  876. * ROUTINE : DeringBlockWeak_MMX
  877. *
  878. * INPUTS : const POSTPROC_INSTANCE *pbi : Pointer to post-processor instance.
  879. * const UINT8 *SrcPtr : Pointer to input image.
  880. * UINT8 *DstPtr : Pointer to output image.
  881. * const INT32 Pitch : Image stride.
  882. * UINT32 FragQIndex : Q-index block encoded with.
  883. * UINT32 *QuantScale : Array of quantization scale factors.
  884. *
  885. * OUTPUTS : None.
  886. *
  887. * RETURNS : void
  888. *
  889. * FUNCTION : Filters a block for de-ringing purposes.
  890. *
  891. * SPECIAL NOTES : None.
  892. *
  893. ****************************************************************************/
  894. void DeringBlockWeak_MMX
  895. (
  896. const POSTPROC_INSTANCE *pbi,
  897. const UINT8 *SrcPtr,
  898. UINT8 *DstPtr,
  899. const INT32 Pitch,
  900. UINT32 FragQIndex,
  901. UINT32 *QuantScale
  902. )
  903. {
  904. #if defined(_WIN32_WCE)
  905. #pragma pack(16)
  906. short UDMod[72];
  907. short LRMod[128];
  908. #pragma pack()
  909. #else
  910. __declspec(align(16)) short UDMod[72];
  911. __declspec(align(16)) short LRMod[128];
  912. #endif
  913. unsigned int PlaneLineStep = Pitch;
  914. const unsigned char *Src = SrcPtr;
  915. unsigned char *Des = DstPtr;
  916. short *UDPointer = UDMod;
  917. short *LRPointer = LRMod;
  918. UINT32 QStep = QuantScale[FragQIndex];
  919. INT32 Sharpen = SharpenModifier[FragQIndex];
  920. (void) pbi;
  921. __asm
  922. {
  923. push esi
  924. push edi
  925. mov esi, Src /* Source Pointer */
  926. mov edi, UDPointer /* UD modifier pointer */
  927. push ecx
  928. push edx
  929. mov ecx, PlaneLineStep /* Pitch Step */
  930. xor edx, edx
  931. push eax
  932. push ebx
  933. mov eax, QStep /* QValue */
  934. mov ebx, Sharpen /* Sharpen */
  935. movd mm0, eax /* QValue */
  936. movd mm2, ebx /* sharpen */
  937. punpcklbw mm0, mm0 /* 00 00 00 QQ */
  938. sub edx, ecx /* Negative Pitch */
  939. punpcklbw mm2, mm2 /* 00 00 00 SS */
  940. pxor mm7, mm7 /* clear mm7 for unpacks */
  941. punpcklbw mm0, mm0 /* 00 00 qq qq */
  942. mov eax, LRPointer /* Left and Right Modifier */
  943. punpcklbw mm2, mm2 /* 00 00 ss ss */
  944. lea ebx, [esi+ecx*8] /* Source Pointer of last row */
  945. punpcklbw mm0, mm0 /* qq qq qq qq */
  946. movq mm1, mm0; /* make a copy */
  947. punpcklbw mm2, mm2 /* ss ss ss ss */
  948. paddb mm1, mm0 /* QValue * 2 */
  949. paddb mm1, mm0 /* High = 3 * Qvalue */
  950. paddusb mm1, eight231s /* clamping high to 24 */
  951. paddb mm0, eight32s /* 32+QValues */
  952. psubusb mm1, eight231s /* Get the real value back */
  953. movq mm3, eight127s /* 7f 7f 7f 7f 7f 7f 7f 7f */
  954. pandn mm1, mm3 /* ClampHigh */
  955. /* mm0,mm1,mm2,mm7 are in use */
  956. /* mm0---> QValue+32 */
  957. /* mm1---> ClampHigh */
  958. /* mm2---> Sharpen */
  959. /* mm7---> Cleared for unpack */
  960. FillModLoop1:
  961. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  962. movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
  963. movq mm5, mm3 /* make a copy of p */
  964. psubusb mm3, mm4 /* p-pu */
  965. psubusb mm4, mm5 /* pu-p */
  966. por mm3, mm4 /* abs(p-pu) */
  967. movq mm6, mm0 /* 32+QValues */
  968. paddusb mm3, mm3 /* 2*abs(p-pu) */
  969. movq mm4, mm0 /* 32+QValues */
  970. psubusb mm6, mm3 /* zero clampled TmpMod */
  971. movq mm5, eight128s /* 80 80 80 80 80 80 80 80 */
  972. paddb mm4, eight64s /* 32+QValues + 64 */
  973. pxor mm4, mm5 /* convert to a sign number */
  974. pxor mm3, mm5 /* convert to a sign number */
  975. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  976. pand mm3, mm2 /* use sharpen */
  977. paddsb mm6, mm1 /* clamping to high */
  978. psubsb mm6, mm1 /* offset back */
  979. por mm6, mm3 /* Mod value to be stored */
  980. pxor mm5, mm5 /* clear mm5 */
  981. pxor mm4, mm4 /* clear mm4 */
  982. punpcklbw mm5, mm6 /* 03 xx 02 xx 01 xx 00 xx */
  983. psraw mm5, 8 /* sign extended */
  984. movq QWORD PTR [edi], mm5 /* writeout UDmod, low four */
  985. punpckhbw mm4, mm6
  986. psraw mm4, 8
  987. movq QWORD PTR [edi+8], mm4 /* writeout UDmod, high four */
  988. /* left Mod */
  989. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  990. movq mm4, QWORD PTR [esi-1] /* Pixels on top pu */
  991. movq mm5, mm3 /* make a copy of p */
  992. psubusb mm3, mm4 /* p-pu */
  993. psubusb mm4, mm5 /* pu-p */
  994. por mm3, mm4 /* abs(p-pu) */
  995. movq mm6, mm0 /* 32+QValues */
  996. paddusb mm3, mm3 /* 2*abs(p-pu) */
  997. movq mm4, mm0 /* 32+QValues */
  998. psubusb mm6, mm3 /* zero clampled TmpMod */
  999. movq mm5, eight128s /* 80 80 80 80 80 80 80 80 */
  1000. paddb mm4, eight64s /* 32+QValues + 64 */
  1001. pxor mm4, mm5 /* convert to a sign number */
  1002. pxor mm3, mm5 /* convert to a sign number */
  1003. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  1004. pand mm3, mm2 /* use sharpen */
  1005. paddsb mm6, mm1 /* clamping to high */
  1006. psubsb mm6, mm1 /* offset back */
  1007. por mm6, mm3 /* Mod value to be stored */
  1008. pxor mm5, mm5 /* clear mm5 */
  1009. pxor mm4, mm4 /* clear mm4 */
  1010. punpcklbw mm5, mm6 /* 03 xx 02 xx 01 xx 00 xx */
  1011. psraw mm5, 8 /* sign extended */
  1012. movq QWORD PTR [eax], mm5 /* writeout UDmod, low four */
  1013. punpckhbw mm4, mm6
  1014. psraw mm4, 8
  1015. movq QWORD PTR [eax+8], mm4 /* writeout UDmod, high four */
  1016. /* Right Mod */
  1017. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  1018. movq mm4, QWORD PTR [esi+1] /* Pixels on top pu */
  1019. movq mm5, mm3 /* make a copy of p */
  1020. psubusb mm3, mm4 /* p-pu */
  1021. psubusb mm4, mm5 /* pu-p */
  1022. por mm3, mm4 /* abs(p-pu) */
  1023. movq mm6, mm0 /* 32+QValues */
  1024. paddusb mm3, mm3 /* 2*abs(p-pu) */
  1025. movq mm4, mm0 /* 32+QValues */
  1026. psubusb mm6, mm3 /* zero clampled TmpMod */
  1027. movq mm5, eight128s /* 80 80 80 80 80 80 80 80 */
  1028. paddb mm4, eight64s /* 32+QValues + 64 */
  1029. pxor mm4, mm5 /* convert to a sign number */
  1030. pxor mm3, mm5 /* convert to a sign number */
  1031. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  1032. pand mm3, mm2 /* use sharpen */
  1033. paddsb mm6, mm1 /* clamping to high */
  1034. psubsb mm6, mm1 /* offset back */
  1035. por mm6, mm3 /* Mod value to be stored */
  1036. pxor mm5, mm5 /* clear mm5 */
  1037. pxor mm4, mm4 /* clear mm4 */
  1038. punpcklbw mm5, mm6 /* 03 xx 02 xx 01 xx 00 xx */
  1039. psraw mm5, 8 /* sign extended */
  1040. movq QWORD PTR [eax+128], mm5 /* writeout UDmod, low four */
  1041. punpckhbw mm4, mm6
  1042. psraw mm4, 8
  1043. movq QWORD PTR [eax+136], mm4 /* writeout UDmod, high four */
  1044. add esi, ecx
  1045. add edi, 16
  1046. add eax, 16
  1047. cmp esi, ebx
  1048. jne FillModLoop1
  1049. /* last UDMod */
  1050. movq mm3, QWORD PTR [esi] /* read 8 pixels p */
  1051. movq mm4, QWORD PTR [esi+edx] /* Pixels on top pu */
  1052. movq mm5, mm3 /* make a copy of p */
  1053. psubusb mm3, mm4 /* p-pu */
  1054. psubusb mm4, mm5 /* pu-p */
  1055. por mm3, mm4 /* abs(p-pu) */
  1056. movq mm6, mm0 /* 32+QValues */
  1057. paddusb mm3, mm3 /* 2*abs(p-pu) */
  1058. movq mm4, mm0 /* 32+QValues */
  1059. psubusb mm6, mm3 /* zero clampled TmpMod */
  1060. movq mm5, eight128s /* 80 80 80 80 80 80 80 80 */
  1061. paddb mm4, eight64s /* 32+QValues + 64 */
  1062. pxor mm4, mm5 /* convert to a sign number */
  1063. pxor mm3, mm5 /* convert to a sign number */
  1064. pcmpgtb mm3, mm4 /* 32+QValue- 2*abs(p-pu) <-64 ? */
  1065. pand mm3, mm2 /* use sharpen */
  1066. paddsb mm6, mm1 /* clamping to high */
  1067. psubsb mm6, mm1 /* offset back */
  1068. por mm6, mm3 /* Mod value to be stored */
  1069. pxor mm5, mm5 /* clear mm5 */
  1070. pxor mm4, mm4 /* clear mm4 */
  1071. punpcklbw mm5, mm6 /* 03 xx 02 xx 01 xx 00 xx */
  1072. psraw mm5, 8 /* sign extended */
  1073. movq QWORD PTR [edi], mm5 /* writeout UDmod, low four */
  1074. punpckhbw mm4, mm6
  1075. psraw mm4, 8
  1076. movq QWORD PTR [edi+8], mm4 /* writeout UDmod, high four */
  1077. mov esi, Src
  1078. mov edi, Des
  1079. mov eax, UDPointer
  1080. mov ebx, LRPointer
  1081. /* First Row */
  1082. movq mm0, [esi+edx] /* mm0 = Pixels above */
  1083. pxor mm7, mm7 /* clear mm7 */
  1084. movq mm1, mm0 /* make a copy of mm0 */
  1085. punpcklbw mm0, mm7 /* lower four pixels */
  1086. movq mm4, [eax] /* au */
  1087. punpckhbw mm1, mm7 /* high four pixels */
  1088. movq mm5, [eax+8] /* au */
  1089. pmullw mm0, mm4 /* pu*au */
  1090. movq mm2, [esi+ecx] /* mm2 = pixels below */
  1091. pmullw mm1, mm5 /* pu*au */
  1092. movq mm3, mm2 /* make a copy of mm2 */
  1093. punpcklbw mm2, mm7 /* lower four */
  1094. movq mm6, [eax+16] /* ad */
  1095. punpckhbw mm3, mm7 /* higher four */
  1096. paddw mm4, mm6 /* au+ad */
  1097. pmullw mm2, mm6 /* au*pu+ad*pd */
  1098. movq mm6, [eax+24] /* ad */
  1099. paddw mm0, mm2
  1100. paddw mm5, mm6 /* au+ad */
  1101. pmullw mm3, mm6 /* ad*pd */
  1102. movq mm2, [esi-1] /* pixel to the left */
  1103. paddw mm1, mm3 /* au*pu+ad*pd */
  1104. movq mm3, mm2 /* make a copy of mm2 */
  1105. punpcklbw mm2, mm7 /* four left pixels */
  1106. movq mm6, [ebx] /* al */
  1107. punpckhbw mm3, mm7 /* four right pixels */
  1108. paddw mm4, mm6 /* au + ad + al */
  1109. pmullw mm2, mm6 /* pl * al */
  1110. movq mm6, [ebx+8] /* al */
  1111. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  1112. paddw mm5, mm6 /* au+ad+al */
  1113. pmullw mm3, mm6 /* al*pl */
  1114. movq mm2, [esi+1] /* pixel to the right */
  1115. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  1116. movq mm3, mm2 /* make a copy of mm2 */
  1117. punpcklbw mm2, mm7 /* four left pixels */
  1118. movq mm6, [ebx+128] /* ar */
  1119. punpckhbw mm3, mm7 /* four right pixels */
  1120. paddw mm4, mm6 /* au + ad + al + ar */
  1121. pmullw mm2, mm6 /* pr * ar */
  1122. movq mm6, [ebx+136] /* ar */
  1123. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  1124. paddw mm5, mm6 /* au+ad+al+ar */
  1125. pmullw mm3, mm6 /* ar*pr */
  1126. movq mm2, [esi] /* p */
  1127. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  1128. movq mm3, mm2 /* make a copy of the pixel */
  1129. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  1130. /* mm4, mm5 --- au + ad + al + ar */
  1131. punpcklbw mm2, mm7 /* left four pixels */
  1132. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1133. punpckhbw mm3, mm7 /* right four pixels */
  1134. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  1135. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  1136. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1137. paddw mm0, mm2 /* sum */
  1138. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  1139. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  1140. movq mm6, Four64s /* {64, 64, 64, 64 } */
  1141. movq mm7, mm6 /* {64, 64, 64, 64} */
  1142. paddw mm0, mm6 /* sum+B */
  1143. paddw mm1, mm3 /* sum */
  1144. psllw mm7, 8 /* {16384, .. } */
  1145. paddw mm0, mm7 /* clamping */
  1146. paddw mm1, mm6 /* sum+B */
  1147. paddw mm1, mm7 /* clamping */
  1148. psubusw mm0, mm7 /* clamping */
  1149. psubusw mm1, mm7 /* clamping */
  1150. psrlw mm0, 7 /* (sum+B)>>7 */
  1151. psrlw mm1, 7 /* (sum+B)>>7 */
  1152. packuswb mm0, mm1 /* pack to 8 bytes */
  1153. movq [edi], mm0 /* write to destination */
  1154. add esi, ecx /* Src += Pitch */
  1155. add edi, ecx /* Des += Pitch */
  1156. add eax, 16 /* UDPointer += 8 */
  1157. add ebx, 16 /* LPointer +=8 */
  1158. /* Second Row */
  1159. movq mm0, [esi+edx] /* mm0 = Pixels above */
  1160. pxor mm7, mm7 /* clear mm7 */
  1161. movq mm1, mm0 /* make a copy of mm0 */
  1162. punpcklbw mm0, mm7 /* lower four pixels */
  1163. movq mm4, [eax] /* au */
  1164. punpckhbw mm1, mm7 /* high four pixels */
  1165. movq mm5, [eax+8] /* au */
  1166. pmullw mm0, mm4 /* pu*au */
  1167. movq mm2, [esi+ecx] /* mm2 = pixels below */
  1168. pmullw mm1, mm5 /* pu*au */
  1169. movq mm3, mm2 /* make a copy of mm2 */
  1170. punpcklbw mm2, mm7 /* lower four */
  1171. movq mm6, [eax+16] /* ad */
  1172. punpckhbw mm3, mm7 /* higher four */
  1173. paddw mm4, mm6 /* au+ad */
  1174. pmullw mm2, mm6 /* au*pu+ad*pd */
  1175. movq mm6, [eax+24] /* ad */
  1176. paddw mm0, mm2
  1177. paddw mm5, mm6 /* au+ad */
  1178. pmullw mm3, mm6 /* ad*pd */
  1179. movq mm2, [esi-1] /* pixel to the left */
  1180. paddw mm1, mm3 /* au*pu+ad*pd */
  1181. movq mm3, mm2 /* make a copy of mm2 */
  1182. punpcklbw mm2, mm7 /* four left pixels */
  1183. movq mm6, [ebx] /* al */
  1184. punpckhbw mm3, mm7 /* four right pixels */
  1185. paddw mm4, mm6 /* au + ad + al */
  1186. pmullw mm2, mm6 /* pl * al */
  1187. movq mm6, [ebx+8] /* al */
  1188. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  1189. paddw mm5, mm6 /* au+ad+al */
  1190. pmullw mm3, mm6 /* al*pl */
  1191. movq mm2, [esi+1] /* pixel to the right */
  1192. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  1193. movq mm3, mm2 /* make a copy of mm2 */
  1194. punpcklbw mm2, mm7 /* four left pixels */
  1195. movq mm6, [ebx+128] /* ar */
  1196. punpckhbw mm3, mm7 /* four right pixels */
  1197. paddw mm4, mm6 /* au + ad + al + ar */
  1198. pmullw mm2, mm6 /* pr * ar */
  1199. movq mm6, [ebx+136] /* ar */
  1200. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  1201. paddw mm5, mm6 /* au+ad+al+ar */
  1202. pmullw mm3, mm6 /* ar*pr */
  1203. movq mm2, [esi] /* p */
  1204. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  1205. movq mm3, mm2 /* make a copy of the pixel */
  1206. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  1207. /* mm4, mm5 --- au + ad + al + ar */
  1208. punpcklbw mm2, mm7 /* left four pixels */
  1209. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1210. punpckhbw mm3, mm7 /* right four pixels */
  1211. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  1212. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  1213. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1214. paddw mm0, mm2 /* sum */
  1215. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  1216. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  1217. movq mm6, Four64s /* {64, 64, 64, 64 } */
  1218. movq mm7, mm6 /* {64, 64, 64, 64} */
  1219. paddw mm0, mm6 /* sum+B */
  1220. paddw mm1, mm3 /* sum */
  1221. psllw mm7, 8 /* {16384, .. } */
  1222. paddw mm0, mm7 /* clamping */
  1223. paddw mm1, mm6 /* sum+B */
  1224. paddw mm1, mm7 /* clamping */
  1225. psubusw mm0, mm7 /* clamping */
  1226. psubusw mm1, mm7 /* clamping */
  1227. psrlw mm0, 7 /* (sum+B)>>7 */
  1228. psrlw mm1, 7 /* (sum+B)>>7 */
  1229. packuswb mm0, mm1 /* pack to 8 bytes */
  1230. movq [edi], mm0 /* write to destination */
  1231. add esi, ecx /* Src += Pitch */
  1232. add edi, ecx /* Des += Pitch */
  1233. add eax, 16 /* UDPointer += 8 */
  1234. add ebx, 16 /* LPointer +=8 */
  1235. /* Third Row */
  1236. movq mm0, [esi+edx] /* mm0 = Pixels above */
  1237. pxor mm7, mm7 /* clear mm7 */
  1238. movq mm1, mm0 /* make a copy of mm0 */
  1239. punpcklbw mm0, mm7 /* lower four pixels */
  1240. movq mm4, [eax] /* au */
  1241. punpckhbw mm1, mm7 /* high four pixels */
  1242. movq mm5, [eax+8] /* au */
  1243. pmullw mm0, mm4 /* pu*au */
  1244. movq mm2, [esi+ecx] /* mm2 = pixels below */
  1245. pmullw mm1, mm5 /* pu*au */
  1246. movq mm3, mm2 /* make a copy of mm2 */
  1247. punpcklbw mm2, mm7 /* lower four */
  1248. movq mm6, [eax+16] /* ad */
  1249. punpckhbw mm3, mm7 /* higher four */
  1250. paddw mm4, mm6 /* au+ad */
  1251. pmullw mm2, mm6 /* au*pu+ad*pd */
  1252. movq mm6, [eax+24] /* ad */
  1253. paddw mm0, mm2
  1254. paddw mm5, mm6 /* au+ad */
  1255. pmullw mm3, mm6 /* ad*pd */
  1256. movq mm2, [esi-1] /* pixel to the left */
  1257. paddw mm1, mm3 /* au*pu+ad*pd */
  1258. movq mm3, mm2 /* make a copy of mm2 */
  1259. punpcklbw mm2, mm7 /* four left pixels */
  1260. movq mm6, [ebx] /* al */
  1261. punpckhbw mm3, mm7 /* four right pixels */
  1262. paddw mm4, mm6 /* au + ad + al */
  1263. pmullw mm2, mm6 /* pl * al */
  1264. movq mm6, [ebx+8] /* al */
  1265. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  1266. paddw mm5, mm6 /* au+ad+al */
  1267. pmullw mm3, mm6 /* al*pl */
  1268. movq mm2, [esi+1] /* pixel to the right */
  1269. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  1270. movq mm3, mm2 /* make a copy of mm2 */
  1271. punpcklbw mm2, mm7 /* four left pixels */
  1272. movq mm6, [ebx+128] /* ar */
  1273. punpckhbw mm3, mm7 /* four right pixels */
  1274. paddw mm4, mm6 /* au + ad + al + ar */
  1275. pmullw mm2, mm6 /* pr * ar */
  1276. movq mm6, [ebx+136] /* ar */
  1277. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  1278. paddw mm5, mm6 /* au+ad+al+ar */
  1279. pmullw mm3, mm6 /* ar*pr */
  1280. movq mm2, [esi] /* p */
  1281. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  1282. movq mm3, mm2 /* make a copy of the pixel */
  1283. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  1284. /* mm4, mm5 --- au + ad + al + ar */
  1285. punpcklbw mm2, mm7 /* left four pixels */
  1286. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1287. punpckhbw mm3, mm7 /* right four pixels */
  1288. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  1289. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  1290. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1291. paddw mm0, mm2 /* sum */
  1292. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  1293. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  1294. movq mm6, Four64s /* {64, 64, 64, 64 } */
  1295. movq mm7, mm6 /* {64, 64, 64, 64} */
  1296. paddw mm0, mm6 /* sum+B */
  1297. paddw mm1, mm3 /* sum */
  1298. psllw mm7, 8 /* {16384, .. } */
  1299. paddw mm0, mm7 /* clamping */
  1300. paddw mm1, mm6 /* sum+B */
  1301. paddw mm1, mm7 /* clamping */
  1302. psubusw mm0, mm7 /* clamping */
  1303. psubusw mm1, mm7 /* clamping */
  1304. psrlw mm0, 7 /* (sum+B)>>7 */
  1305. psrlw mm1, 7 /* (sum+B)>>7 */
  1306. packuswb mm0, mm1 /* pack to 8 bytes */
  1307. movq [edi], mm0 /* write to destination */
  1308. add esi, ecx /* Src += Pitch */
  1309. add edi, ecx /* Des += Pitch */
  1310. add eax, 16 /* UDPointer += 8 */
  1311. add ebx, 16 /* LPointer +=8 */
  1312. /* Fourth Row */
  1313. movq mm0, [esi+edx] /* mm0 = Pixels above */
  1314. pxor mm7, mm7 /* clear mm7 */
  1315. movq mm1, mm0 /* make a copy of mm0 */
  1316. punpcklbw mm0, mm7 /* lower four pixels */
  1317. movq mm4, [eax] /* au */
  1318. punpckhbw mm1, mm7 /* high four pixels */
  1319. movq mm5, [eax+8] /* au */
  1320. pmullw mm0, mm4 /* pu*au */
  1321. movq mm2, [esi+ecx] /* mm2 = pixels below */
  1322. pmullw mm1, mm5 /* pu*au */
  1323. movq mm3, mm2 /* make a copy of mm2 */
  1324. punpcklbw mm2, mm7 /* lower four */
  1325. movq mm6, [eax+16] /* ad */
  1326. punpckhbw mm3, mm7 /* higher four */
  1327. paddw mm4, mm6 /* au+ad */
  1328. pmullw mm2, mm6 /* au*pu+ad*pd */
  1329. movq mm6, [eax+24] /* ad */
  1330. paddw mm0, mm2
  1331. paddw mm5, mm6 /* au+ad */
  1332. pmullw mm3, mm6 /* ad*pd */
  1333. movq mm2, [esi-1] /* pixel to the left */
  1334. paddw mm1, mm3 /* au*pu+ad*pd */
  1335. movq mm3, mm2 /* make a copy of mm2 */
  1336. punpcklbw mm2, mm7 /* four left pixels */
  1337. movq mm6, [ebx] /* al */
  1338. punpckhbw mm3, mm7 /* four right pixels */
  1339. paddw mm4, mm6 /* au + ad + al */
  1340. pmullw mm2, mm6 /* pl * al */
  1341. movq mm6, [ebx+8] /* al */
  1342. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  1343. paddw mm5, mm6 /* au+ad+al */
  1344. pmullw mm3, mm6 /* al*pl */
  1345. movq mm2, [esi+1] /* pixel to the right */
  1346. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  1347. movq mm3, mm2 /* make a copy of mm2 */
  1348. punpcklbw mm2, mm7 /* four left pixels */
  1349. movq mm6, [ebx+128] /* ar */
  1350. punpckhbw mm3, mm7 /* four right pixels */
  1351. paddw mm4, mm6 /* au + ad + al + ar */
  1352. pmullw mm2, mm6 /* pr * ar */
  1353. movq mm6, [ebx+136] /* ar */
  1354. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  1355. paddw mm5, mm6 /* au+ad+al+ar */
  1356. pmullw mm3, mm6 /* ar*pr */
  1357. movq mm2, [esi] /* p */
  1358. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  1359. movq mm3, mm2 /* make a copy of the pixel */
  1360. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  1361. /* mm4, mm5 --- au + ad + al + ar */
  1362. punpcklbw mm2, mm7 /* left four pixels */
  1363. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1364. punpckhbw mm3, mm7 /* right four pixels */
  1365. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  1366. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  1367. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1368. paddw mm0, mm2 /* sum */
  1369. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  1370. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  1371. movq mm6, Four64s /* {64, 64, 64, 64 } */
  1372. movq mm7, mm6 /* {64, 64, 64, 64} */
  1373. paddw mm0, mm6 /* sum+B */
  1374. paddw mm1, mm3 /* sum */
  1375. psllw mm7, 8 /* {16384, .. } */
  1376. paddw mm0, mm7 /* clamping */
  1377. paddw mm1, mm6 /* sum+B */
  1378. paddw mm1, mm7 /* clamping */
  1379. psubusw mm0, mm7 /* clamping */
  1380. psubusw mm1, mm7 /* clamping */
  1381. psrlw mm0, 7 /* (sum+B)>>7 */
  1382. psrlw mm1, 7 /* (sum+B)>>7 */
  1383. packuswb mm0, mm1 /* pack to 8 bytes */
  1384. movq [edi], mm0 /* write to destination */
  1385. add esi, ecx /* Src += Pitch */
  1386. add edi, ecx /* Des += Pitch */
  1387. add eax, 16 /* UDPointer += 8 */
  1388. add ebx, 16 /* LPointer +=8 */
  1389. /* Fifth Row */
  1390. movq mm0, [esi+edx] /* mm0 = Pixels above */
  1391. pxor mm7, mm7 /* clear mm7 */
  1392. movq mm1, mm0 /* make a copy of mm0 */
  1393. punpcklbw mm0, mm7 /* lower four pixels */
  1394. movq mm4, [eax] /* au */
  1395. punpckhbw mm1, mm7 /* high four pixels */
  1396. movq mm5, [eax+8] /* au */
  1397. pmullw mm0, mm4 /* pu*au */
  1398. movq mm2, [esi+ecx] /* mm2 = pixels below */
  1399. pmullw mm1, mm5 /* pu*au */
  1400. movq mm3, mm2 /* make a copy of mm2 */
  1401. punpcklbw mm2, mm7 /* lower four */
  1402. movq mm6, [eax+16] /* ad */
  1403. punpckhbw mm3, mm7 /* higher four */
  1404. paddw mm4, mm6 /* au+ad */
  1405. pmullw mm2, mm6 /* au*pu+ad*pd */
  1406. movq mm6, [eax+24] /* ad */
  1407. paddw mm0, mm2
  1408. paddw mm5, mm6 /* au+ad */
  1409. pmullw mm3, mm6 /* ad*pd */
  1410. movq mm2, [esi-1] /* pixel to the left */
  1411. paddw mm1, mm3 /* au*pu+ad*pd */
  1412. movq mm3, mm2 /* make a copy of mm2 */
  1413. punpcklbw mm2, mm7 /* four left pixels */
  1414. movq mm6, [ebx] /* al */
  1415. punpckhbw mm3, mm7 /* four right pixels */
  1416. paddw mm4, mm6 /* au + ad + al */
  1417. pmullw mm2, mm6 /* pl * al */
  1418. movq mm6, [ebx+8] /* al */
  1419. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  1420. paddw mm5, mm6 /* au+ad+al */
  1421. pmullw mm3, mm6 /* al*pl */
  1422. movq mm2, [esi+1] /* pixel to the right */
  1423. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  1424. movq mm3, mm2 /* make a copy of mm2 */
  1425. punpcklbw mm2, mm7 /* four left pixels */
  1426. movq mm6, [ebx+128] /* ar */
  1427. punpckhbw mm3, mm7 /* four right pixels */
  1428. paddw mm4, mm6 /* au + ad + al + ar */
  1429. pmullw mm2, mm6 /* pr * ar */
  1430. movq mm6, [ebx+136] /* ar */
  1431. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  1432. paddw mm5, mm6 /* au+ad+al+ar */
  1433. pmullw mm3, mm6 /* ar*pr */
  1434. movq mm2, [esi] /* p */
  1435. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  1436. movq mm3, mm2 /* make a copy of the pixel */
  1437. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  1438. /* mm4, mm5 --- au + ad + al + ar */
  1439. punpcklbw mm2, mm7 /* left four pixels */
  1440. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1441. punpckhbw mm3, mm7 /* right four pixels */
  1442. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  1443. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  1444. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1445. paddw mm0, mm2 /* sum */
  1446. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  1447. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  1448. movq mm6, Four64s /* {64, 64, 64, 64 } */
  1449. movq mm7, mm6 /* {64, 64, 64, 64} */
  1450. paddw mm0, mm6 /* sum+B */
  1451. paddw mm1, mm3 /* sum */
  1452. psllw mm7, 8 /* {16384, .. } */
  1453. paddw mm0, mm7 /* clamping */
  1454. paddw mm1, mm6 /* sum+B */
  1455. paddw mm1, mm7 /* clamping */
  1456. psubusw mm0, mm7 /* clamping */
  1457. psubusw mm1, mm7 /* clamping */
  1458. psrlw mm0, 7 /* (sum+B)>>7 */
  1459. psrlw mm1, 7 /* (sum+B)>>7 */
  1460. packuswb mm0, mm1 /* pack to 8 bytes */
  1461. movq [edi], mm0 /* write to destination */
  1462. add esi, ecx /* Src += Pitch */
  1463. add edi, ecx /* Des += Pitch */
  1464. add eax, 16 /* UDPointer += 8 */
  1465. add ebx, 16 /* LPointer +=8 */
  1466. /* Sixth Row */
  1467. movq mm0, [esi+edx] /* mm0 = Pixels above */
  1468. pxor mm7, mm7 /* clear mm7 */
  1469. movq mm1, mm0 /* make a copy of mm0 */
  1470. punpcklbw mm0, mm7 /* lower four pixels */
  1471. movq mm4, [eax] /* au */
  1472. punpckhbw mm1, mm7 /* high four pixels */
  1473. movq mm5, [eax+8] /* au */
  1474. pmullw mm0, mm4 /* pu*au */
  1475. movq mm2, [esi+ecx] /* mm2 = pixels below */
  1476. pmullw mm1, mm5 /* pu*au */
  1477. movq mm3, mm2 /* make a copy of mm2 */
  1478. punpcklbw mm2, mm7 /* lower four */
  1479. movq mm6, [eax+16] /* ad */
  1480. punpckhbw mm3, mm7 /* higher four */
  1481. paddw mm4, mm6 /* au+ad */
  1482. pmullw mm2, mm6 /* au*pu+ad*pd */
  1483. movq mm6, [eax+24] /* ad */
  1484. paddw mm0, mm2
  1485. paddw mm5, mm6 /* au+ad */
  1486. pmullw mm3, mm6 /* ad*pd */
  1487. movq mm2, [esi-1] /* pixel to the left */
  1488. paddw mm1, mm3 /* au*pu+ad*pd */
  1489. movq mm3, mm2 /* make a copy of mm2 */
  1490. punpcklbw mm2, mm7 /* four left pixels */
  1491. movq mm6, [ebx] /* al */
  1492. punpckhbw mm3, mm7 /* four right pixels */
  1493. paddw mm4, mm6 /* au + ad + al */
  1494. pmullw mm2, mm6 /* pl * al */
  1495. movq mm6, [ebx+8] /* al */
  1496. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  1497. paddw mm5, mm6 /* au+ad+al */
  1498. pmullw mm3, mm6 /* al*pl */
  1499. movq mm2, [esi+1] /* pixel to the right */
  1500. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  1501. movq mm3, mm2 /* make a copy of mm2 */
  1502. punpcklbw mm2, mm7 /* four left pixels */
  1503. movq mm6, [ebx+128] /* ar */
  1504. punpckhbw mm3, mm7 /* four right pixels */
  1505. paddw mm4, mm6 /* au + ad + al + ar */
  1506. pmullw mm2, mm6 /* pr * ar */
  1507. movq mm6, [ebx+136] /* ar */
  1508. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  1509. paddw mm5, mm6 /* au+ad+al+ar */
  1510. pmullw mm3, mm6 /* ar*pr */
  1511. movq mm2, [esi] /* p */
  1512. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  1513. movq mm3, mm2 /* make a copy of the pixel */
  1514. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  1515. /* mm4, mm5 --- au + ad + al + ar */
  1516. punpcklbw mm2, mm7 /* left four pixels */
  1517. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1518. punpckhbw mm3, mm7 /* right four pixels */
  1519. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  1520. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  1521. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1522. paddw mm0, mm2 /* sum */
  1523. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  1524. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  1525. movq mm6, Four64s /* {64, 64, 64, 64 } */
  1526. movq mm7, mm6 /* {64, 64, 64, 64} */
  1527. paddw mm0, mm6 /* sum+B */
  1528. paddw mm1, mm3 /* sum */
  1529. psllw mm7, 8 /* {16384, .. } */
  1530. paddw mm0, mm7 /* clamping */
  1531. paddw mm1, mm6 /* sum+B */
  1532. paddw mm1, mm7 /* clamping */
  1533. psubusw mm0, mm7 /* clamping */
  1534. psubusw mm1, mm7 /* clamping */
  1535. psrlw mm0, 7 /* (sum+B)>>7 */
  1536. psrlw mm1, 7 /* (sum+B)>>7 */
  1537. packuswb mm0, mm1 /* pack to 8 bytes */
  1538. movq [edi], mm0 /* write to destination */
  1539. add esi, ecx /* Src += Pitch */
  1540. add edi, ecx /* Des += Pitch */
  1541. add eax, 16 /* UDPointer += 8 */
  1542. add ebx, 16 /* LPointer +=8 */
  1543. /* Seventh Row */
  1544. movq mm0, [esi+edx] /* mm0 = Pixels above */
  1545. pxor mm7, mm7 /* clear mm7 */
  1546. movq mm1, mm0 /* make a copy of mm0 */
  1547. punpcklbw mm0, mm7 /* lower four pixels */
  1548. movq mm4, [eax] /* au */
  1549. punpckhbw mm1, mm7 /* high four pixels */
  1550. movq mm5, [eax+8] /* au */
  1551. pmullw mm0, mm4 /* pu*au */
  1552. movq mm2, [esi+ecx] /* mm2 = pixels below */
  1553. pmullw mm1, mm5 /* pu*au */
  1554. movq mm3, mm2 /* make a copy of mm2 */
  1555. punpcklbw mm2, mm7 /* lower four */
  1556. movq mm6, [eax+16] /* ad */
  1557. punpckhbw mm3, mm7 /* higher four */
  1558. paddw mm4, mm6 /* au+ad */
  1559. pmullw mm2, mm6 /* au*pu+ad*pd */
  1560. movq mm6, [eax+24] /* ad */
  1561. paddw mm0, mm2
  1562. paddw mm5, mm6 /* au+ad */
  1563. pmullw mm3, mm6 /* ad*pd */
  1564. movq mm2, [esi-1] /* pixel to the left */
  1565. paddw mm1, mm3 /* au*pu+ad*pd */
  1566. movq mm3, mm2 /* make a copy of mm2 */
  1567. punpcklbw mm2, mm7 /* four left pixels */
  1568. movq mm6, [ebx] /* al */
  1569. punpckhbw mm3, mm7 /* four right pixels */
  1570. paddw mm4, mm6 /* au + ad + al */
  1571. pmullw mm2, mm6 /* pl * al */
  1572. movq mm6, [ebx+8] /* al */
  1573. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  1574. paddw mm5, mm6 /* au+ad+al */
  1575. pmullw mm3, mm6 /* al*pl */
  1576. movq mm2, [esi+1] /* pixel to the right */
  1577. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  1578. movq mm3, mm2 /* make a copy of mm2 */
  1579. punpcklbw mm2, mm7 /* four left pixels */
  1580. movq mm6, [ebx+128] /* ar */
  1581. punpckhbw mm3, mm7 /* four right pixels */
  1582. paddw mm4, mm6 /* au + ad + al + ar */
  1583. pmullw mm2, mm6 /* pr * ar */
  1584. movq mm6, [ebx+136] /* ar */
  1585. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  1586. paddw mm5, mm6 /* au+ad+al+ar */
  1587. pmullw mm3, mm6 /* ar*pr */
  1588. movq mm2, [esi] /* p */
  1589. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  1590. movq mm3, mm2 /* make a copy of the pixel */
  1591. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  1592. /* mm4, mm5 --- au + ad + al + ar */
  1593. punpcklbw mm2, mm7 /* left four pixels */
  1594. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1595. punpckhbw mm3, mm7 /* right four pixels */
  1596. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  1597. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  1598. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1599. paddw mm0, mm2 /* sum */
  1600. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  1601. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  1602. movq mm6, Four64s /* {64, 64, 64, 64 } */
  1603. movq mm7, mm6 /* {64, 64, 64, 64} */
  1604. paddw mm0, mm6 /* sum+B */
  1605. paddw mm1, mm3 /* sum */
  1606. psllw mm7, 8 /* {16384, .. } */
  1607. paddw mm0, mm7 /* clamping */
  1608. paddw mm1, mm6 /* sum+B */
  1609. paddw mm1, mm7 /* clamping */
  1610. psubusw mm0, mm7 /* clamping */
  1611. psubusw mm1, mm7 /* clamping */
  1612. psrlw mm0, 7 /* (sum+B)>>7 */
  1613. psrlw mm1, 7 /* (sum+B)>>7 */
  1614. packuswb mm0, mm1 /* pack to 8 bytes */
  1615. movq [edi], mm0 /* write to destination */
  1616. add esi, ecx /* Src += Pitch */
  1617. add edi, ecx /* Des += Pitch */
  1618. add eax, 16 /* UDPointer += 8 */
  1619. add ebx, 16 /* LPointer +=8 */
  1620. /* Eighth Row */
  1621. movq mm0, [esi+edx] /* mm0 = Pixels above */
  1622. pxor mm7, mm7 /* clear mm7 */
  1623. movq mm1, mm0 /* make a copy of mm0 */
  1624. punpcklbw mm0, mm7 /* lower four pixels */
  1625. movq mm4, [eax] /* au */
  1626. punpckhbw mm1, mm7 /* high four pixels */
  1627. movq mm5, [eax+8] /* au */
  1628. pmullw mm0, mm4 /* pu*au */
  1629. movq mm2, [esi+ecx] /* mm2 = pixels below */
  1630. pmullw mm1, mm5 /* pu*au */
  1631. movq mm3, mm2 /* make a copy of mm2 */
  1632. punpcklbw mm2, mm7 /* lower four */
  1633. movq mm6, [eax+16] /* ad */
  1634. punpckhbw mm3, mm7 /* higher four */
  1635. paddw mm4, mm6 /* au+ad */
  1636. pmullw mm2, mm6 /* au*pu+ad*pd */
  1637. movq mm6, [eax+24] /* ad */
  1638. paddw mm0, mm2
  1639. paddw mm5, mm6 /* au+ad */
  1640. pmullw mm3, mm6 /* ad*pd */
  1641. movq mm2, [esi-1] /* pixel to the left */
  1642. paddw mm1, mm3 /* au*pu+ad*pd */
  1643. movq mm3, mm2 /* make a copy of mm2 */
  1644. punpcklbw mm2, mm7 /* four left pixels */
  1645. movq mm6, [ebx] /* al */
  1646. punpckhbw mm3, mm7 /* four right pixels */
  1647. paddw mm4, mm6 /* au + ad + al */
  1648. pmullw mm2, mm6 /* pl * al */
  1649. movq mm6, [ebx+8] /* al */
  1650. paddw mm0, mm2 /* au*pu+ad*pd+al*pl */
  1651. paddw mm5, mm6 /* au+ad+al */
  1652. pmullw mm3, mm6 /* al*pl */
  1653. movq mm2, [esi+1] /* pixel to the right */
  1654. paddw mm1, mm3 /* au*pu+ad*pd+al*pl */
  1655. movq mm3, mm2 /* make a copy of mm2 */
  1656. punpcklbw mm2, mm7 /* four left pixels */
  1657. movq mm6, [ebx+128] /* ar */
  1658. punpckhbw mm3, mm7 /* four right pixels */
  1659. paddw mm4, mm6 /* au + ad + al + ar */
  1660. pmullw mm2, mm6 /* pr * ar */
  1661. movq mm6, [ebx+136] /* ar */
  1662. paddw mm0, mm2 /* au*pu+ad*pd+al*pl+pr*ar */
  1663. paddw mm5, mm6 /* au+ad+al+ar */
  1664. pmullw mm3, mm6 /* ar*pr */
  1665. movq mm2, [esi] /* p */
  1666. paddw mm1, mm3 /* au*pu+ad*pd+al*pl+ar*pr */
  1667. movq mm3, mm2 /* make a copy of the pixel */
  1668. /* mm0, mm1 --- au*pu+ad*pd+al*pl+ar*pr */
  1669. /* mm4, mm5 --- au + ad + al + ar */
  1670. punpcklbw mm2, mm7 /* left four pixels */
  1671. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1672. punpckhbw mm3, mm7 /* right four pixels */
  1673. psubw mm6, mm4 /* 128-(au+ad+al+ar) */
  1674. pmullw mm2, mm6 /* p*(128-(au+ad+al+ar)) */
  1675. movq mm6, Four128s /* 0080 0080 0080 0080 */
  1676. paddw mm0, mm2 /* sum */
  1677. psubw mm6, mm5 /* 128-(au+ad+al+ar) */
  1678. pmullw mm3, mm6 /* p*(128-(au+ad+al+ar)) */
  1679. movq mm6, Four64s /* {64, 64, 64, 64 } */
  1680. movq mm7, mm6 /* {64, 64, 64, 64} */
  1681. paddw mm0, mm6 /* sum+B */
  1682. paddw mm1, mm3 /* sum */
  1683. psllw mm7, 8 /* {16384, .. } */
  1684. paddw mm0, mm7 /* clamping */
  1685. paddw mm1, mm6 /* sum+B */
  1686. paddw mm1, mm7 /* clamping */
  1687. psubusw mm0, mm7 /* clamping */
  1688. psubusw mm1, mm7 /* clamping */
  1689. psrlw mm0, 7 /* (sum+B)>>7 */
  1690. psrlw mm1, 7 /* (sum+B)>>7 */
  1691. packuswb mm0, mm1 /* pack to 8 bytes */
  1692. movq [edi], mm0 /* write to destination */
  1693. pop ebx
  1694. pop eax
  1695. pop edx
  1696. pop ecx
  1697. pop edi
  1698. pop esi
  1699. }
  1700. }