; prediction.asm
.686
.XMM
.model FLAT
copy_image_data_16x16_stride@OptimizedFunctions = 32
dec_picture@VideoParameters = 698192
p_Slice@MacroBlock = 0
plane_images@StorablePicture = 158512
mb_rec@Slice = 1696
mb_pred@Slice = 928
cof@Slice = 2464
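;
; These equates are byte offsets into the decoder's structs, so e.g.
; [ebp+mb_pred@Slice] addresses currSlice->mb_pred when ebp holds a Slice
; pointer. The scaled addressing used in _iMBtrans4x4 below corresponds to
; (struct layout inferred from the code, not from headers):
;
;;; // uint8_t *pred = (uint8_t *)currSlice + 928 + 256 * pl;              // currSlice->mb_pred[pl], 16x16 bytes
;;; // int16_t *cof = (int16_t *)((uint8_t *)currSlice + 2464 + 512 * pl); // currSlice->cof4[pl], 16 blocks of 32 bytes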
CONST SEGMENT
align 16
; eight words of 32: rounding term for the (x + 32) >> 6 normalization below;
; the MMX code loads the low quadword, the SSE2 code the full 16 bytes
const32 DW 020H, 020H, 020H, 020H, 020H, 020H, 020H, 020H
CONST ENDS
;
; weighted_bi_prediction4x4(mb_pred, block_l0, wp_scale_l0, wp_scale_l1,
;                           wp_offset, weight_denom)
; mb_pred holds one prediction on entry (scaled by wp_scale_l1) and receives
; the combined, offset, saturated result; block_l0 is scaled by wp_scale_l0.
;
PUBLIC _weighted_bi_prediction4x4
_TEXT SEGMENT
mb_pred = 4
block_l0 = 8
wp_scale_l0 = 12
wp_scale_l1 = 16
wp_offset = 20
weight_denom = 24
_weighted_bi_prediction4x4 PROC ; COMDAT
mov eax, DWORD PTR weight_denom[esp]
pxor mm0, mm0
pshufw mm1, MMWORD PTR wp_scale_l0[esp], 0
test eax, eax
pshufw mm2, MMWORD PTR wp_scale_l1[esp], 0
pshufw mm3, MMWORD PTR wp_offset[esp], 0
jle BI_PRED4x4@LEFT_SHIFT
movd mm4, eax ; mm4 = weight_denom (right-shift count)
lea ecx, DWORD PTR [eax-1] ; ecx = weight_denom - 1
mov edx, 1
shl edx, cl ; edx = 1 << (weight_denom - 1), the rounding term
movd mm5, edx
mov eax, mb_pred[esp]
mov edx, block_l0[esp]
pshufw mm5, mm5, 0 ; broadcast rounding term to all four words
movd mm6, DWORD PTR 0[edx] ; block_l0
movd mm7, DWORD PTR 0[eax] ; mb_pred
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 16[eax] ; mb_pred
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 0[eax], mm6
movd mm6, DWORD PTR 16[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 32[eax] ; mb_pred
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 16[eax], mm6
movd mm6, DWORD PTR 32[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 48[eax] ; mb_pred
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 32[eax], mm6
movd mm6, DWORD PTR 48[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 48[eax], mm6
ret 0
BI_PRED4x4@LEFT_SHIFT:
neg eax ; weight_denom <= 0: shift left by -weight_denom instead
movd mm4, eax
mov eax, mb_pred[esp]
mov edx, block_l0[esp]
movd mm6, DWORD PTR 0[edx] ; block_l0
movd mm7, DWORD PTR 0[eax] ; mb_pred
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 16[eax] ; mb_pred
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 0[eax], mm6
movd mm6, DWORD PTR 16[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 32[eax] ; mb_pred
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 16[eax], mm6
movd mm6, DWORD PTR 32[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 48[eax] ; mb_pred
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 32[eax], mm6
movd mm6, DWORD PTR 48[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 48[eax], mm6
ret 0
_weighted_bi_prediction4x4 ENDP
_TEXT ENDS
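;
; For reference, a minimal C sketch of the per-pixel operation implemented
; above (the signature and 16-byte row stride are inferred from the code; the
; assembly works in 16-bit lanes and saturates via packuswb):
;
;;; #include <stdint.h>
;;; static void weighted_bi_prediction4x4_ref(uint8_t mb_pred[4][16], const uint8_t block_l0[4][16],
;;;                                           int wp_scale_l0, int wp_scale_l1,
;;;                                           int wp_offset, int weight_denom)
;;; {
;;;     for (int y = 0; y < 4; y++) {
;;;         for (int x = 0; x < 4; x++) {
;;;             int v = block_l0[y][x] * wp_scale_l0 + mb_pred[y][x] * wp_scale_l1;
;;;             if (weight_denom > 0)
;;;                 v = (v + (1 << (weight_denom - 1))) >> weight_denom; // round, then shift
;;;             else
;;;                 v <<= -weight_denom; // non-positive denom: shift left, no rounding
;;;             v += wp_offset;
;;;             mb_pred[y][x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); // packuswb saturation
;;;         }
;;;     }
;;; }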
PUBLIC _itrans4x4_mmx
_TEXT SEGMENT
_tblock$ = 4 ; size = 4
_mb_pred$ = 8 ; size = 4
_mb_rec$ = 12 ; size = 4
_pos_x$ = 16 ; size = 4
_pos_y$ = 20 ; size = 4
_itrans4x4_mmx PROC ; COMDAT
mov edx, DWORD PTR _pos_y$[esp]
shl edx, 4
add edx, DWORD PTR _pos_x$[esp]
mov eax, DWORD PTR _tblock$[esp]
mov ecx, DWORD PTR _mb_pred$[esp]
add ecx, edx
add edx, DWORD PTR _mb_rec$[esp]
_itrans4x4_mmx_direct PROC ; COMDAT
; load 4x4 matrix
movq mm0, MMWORD PTR 0[eax]
movq mm1, MMWORD PTR 8[eax]
movq mm2, MMWORD PTR 16[eax]
movq mm3, MMWORD PTR 24[eax]
; rotate 4x4 matrix
movq mm4, mm0 ; p0 = mm4 (copy)
punpcklwd mm0, mm2 ; r0 = mm0
punpckhwd mm4, mm2 ; r2 = mm4
movq mm5, mm1 ; p1 = mm5 (copy)
punpcklwd mm1, mm3 ; r1 = mm1
punpckhwd mm5, mm3 ; r3 = mm5
movq mm6, mm0 ; r0 = mm6 (copy)
punpcklwd mm0, mm1 ; t0 = mm0
punpckhwd mm6, mm1 ; t1 = mm6
movq mm1, mm4 ; r2 = mm1 (copy)
punpcklwd mm1, mm5 ; t2 = mm1
punpckhwd mm4, mm5 ; t3 = mm4
movq mm2, mm0 ; mm2 = t0 (copy)
paddw mm0, mm1 ; mm0 = p0
psubw mm2, mm1 ; mm2 = p1, mm1 available
movq mm5, mm6 ; mm5 = t1 (copy)
psraw mm5, 1 ; mm5 = (t1 >> 1)
psubw mm5, mm4 ; mm5 = p2
psraw mm4, 1 ; mm4 = (t3 >> 1)
paddw mm6, mm4 ; mm6 = p3
movq mm3, mm0 ; mm3 = p0 (copy)
paddw mm0, mm6 ; mm0 = r0
movq mm1, mm2 ; mm1 = p1 (copy)
paddw mm1, mm5 ; mm1 = r1
psubw mm2, mm5 ; mm2 = r2, mm5 available
psubw mm3, mm6 ; mm3 = r3
; rotate 4x4 matrix to set up for vertical
movq mm4, mm0 ; r0 = mm4 (copy)
punpcklwd mm0, mm2 ; p0 = mm0
punpckhwd mm4, mm2 ; p2 = mm4
movq mm5, mm1 ; r1 = mm5 (copy)
punpcklwd mm1, mm3 ; p1 = mm1
punpckhwd mm5, mm3 ; p3 = mm5
movq mm6, mm0 ; p0 = mm6 (copy)
punpcklwd mm0, mm1 ; t0 = mm0
punpckhwd mm6, mm1 ; t1 = mm6
movq mm1, mm4 ; p2 = mm1 (copy)
punpcklwd mm1, mm5 ; t2 = mm1
punpckhwd mm4, mm5 ; t3 = mm4
movq mm2, mm0 ; mm2 = t0 (copy)
paddw mm0, mm1 ; mm0 = p0
psubw mm2, mm1 ; mm2 = p1, mm1 available
movq mm5, mm6 ; mm5 = t1 (copy)
psraw mm5, 1 ; mm5 = (t1 >> 1)
psubw mm5, mm4 ; mm5 = p2
psraw mm4, 1 ; mm4 = (t3 >> 1)
paddw mm6, mm4 ; mm6 = p3
movq mm3, mm0 ; mm3 = p0 (copy)
paddw mm0, mm6 ; mm0 = r0
movq mm1, mm2 ; mm1 = p1 (copy)
paddw mm1, mm5 ; mm1 = r1
psubw mm2, mm5 ; mm2 = r2, mm5 available
psubw mm3, mm6 ; mm3 = r3
; --- 4x4 iDCT done, now time to combine with mpr ---
movq mm7, MMWORD PTR const32
paddw mm0, mm7 ; rres + 32
psraw mm0, 6 ; (rres + 32) >> 6
paddw mm1, mm7 ; rres + 32
psraw mm1, 6 ; (rres + 32) >> 6
paddw mm2, mm7 ; rres + 32
psraw mm2, 6 ; (rres + 32) >> 6
paddw mm3, mm7 ; rres + 32
psraw mm3, 6 ; (rres + 32) >> 6
pxor mm7, mm7
; convert mpr from unsigned char to short
movd mm4, DWORD PTR 0[ecx]
movd mm5, DWORD PTR 16[ecx]
movd mm6, DWORD PTR 32[ecx]
punpcklbw mm4, mm7
punpcklbw mm5, mm7
punpcklbw mm6, mm7
paddsw mm4, mm0 ; pred_row + rres_row
movd mm0, DWORD PTR 48[ecx] ; reuse mm0 for mpr[3]
paddsw mm5, mm1 ; pred_row + rres_row
punpcklbw mm0, mm7
paddsw mm6, mm2 ; pred_row + rres_row
paddsw mm0, mm3 ; pred_row + rres_row
; results in mm4, mm5, mm6, mm0
; move back to 8 bit
packuswb mm4, mm7
packuswb mm5, mm7
packuswb mm6, mm7
packuswb mm0, mm7
movd DWORD PTR 0[edx], mm4
movd DWORD PTR 16[edx], mm5
movd DWORD PTR 32[edx], mm6
movd DWORD PTR 48[edx], mm0
ret 0
_itrans4x4_mmx_direct ENDP
_itrans4x4_mmx ENDP
_TEXT ENDS
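;
; For reference, the row/column butterfly both passes above implement, as a
; scalar C sketch (names are illustrative; this is the standard H.264 4x4
; inverse-transform core, matching the register comments in the asm):
;
;;; #include <stdint.h>
;;; static void itrans4x4_butterfly(const int16_t t[4], int16_t r[4])
;;; {
;;;     int p0 = t[0] + t[2];        // paddw
;;;     int p1 = t[0] - t[2];        // psubw
;;;     int p2 = (t[1] >> 1) - t[3]; // psraw 1, psubw
;;;     int p3 = t[1] + (t[3] >> 1); // psraw 1, paddw
;;;     r[0] = (int16_t)(p0 + p3);
;;;     r[1] = (int16_t)(p1 + p2);
;;;     r[2] = (int16_t)(p1 - p2);
;;;     r[3] = (int16_t)(p0 - p3);
;;; }
;;; // After the horizontal and vertical passes the residual carries a factor
;;; // of 64, hence the rec = clip255(pred + ((res + 32) >> 6)) combine step.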
EXTRN _itrans_sp:PROC
EXTRN _Inv_Residual_trans_4x4:PROC
PUBLIC _iMBtrans4x4
EXTRN _opt:BYTE
_TEXT SEGMENT
_currSlice$ = -4 ; size = 4
_mb_rec$166704 = 8 ; size = 4
_currMB$ = 8 ; size = 4
_curr_img$ = 12 ; size = 4
_pl$ = 8 ; second parameter
_smb$ = 16 ; size = 4
_iMBtrans4x4 PROC
push ecx
push ebx
push ebp
push esi
STACKOFFSET = 16
; 408 : VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1]: dec_picture->imgY;
mov esi, DWORD PTR _pl$[esp+STACKOFFSET]
push edi
STACKOFFSET = STACKOFFSET + 4
mov edi, DWORD PTR _currMB$[esp+16]
mov ebp, DWORD PTR [edi+p_Slice@MacroBlock] ; ebp: currMB->p_Slice
mov eax, DWORD PTR [edi+4]
mov eax, DWORD PTR [eax+dec_picture@VideoParameters] ; eax: p_Vid->dec_picture;
mov DWORD PTR _currSlice$[esp+20], ebp
mov ecx, DWORD PTR [eax+esi*4+plane_images@StorablePicture]
mov DWORD PTR _curr_img$[esp+16], ecx
cmp DWORD PTR _smb$[esp+16], 0 ; if (smb)
; 413 : {
; 414 : h264_short_block_t *blocks = currSlice->cof4[pl];
; 415 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
; 416 :
; 417 : itrans_sp(blocks[0], mb_pred, currMB, pl, 0, 0);
je $LN4@iMBtrans4x
push 0
push 0
mov eax, esi
shl eax, 9
lea ebx, DWORD PTR [eax+ebp+cof@Slice]
mov ecx, esi
shl ecx, 8
lea ebp, DWORD PTR [ecx+ebp+mb_pred@Slice]
push esi
push ebp
push ebx
mov eax, edi
call _itrans_sp
; 418 : itrans_sp(blocks[1], mb_pred, currMB, pl, 4, 0);
push 0
push 4
push esi
lea edx, DWORD PTR [ebx+32]
push ebp
push edx
mov eax, edi
call _itrans_sp
; 419 : itrans_sp(blocks[2], mb_pred, currMB, pl, 0, 4);
push 4
push 0
push esi
lea eax, DWORD PTR [ebx+64]
push ebp
push eax
mov eax, edi
call _itrans_sp
; 420 : itrans_sp(blocks[3], mb_pred, currMB, pl, 4, 4);
push 4
push 4
push esi
lea ecx, DWORD PTR [ebx+96]
push ebp
push ecx
mov eax, edi
call _itrans_sp
add esp, 80 ; 00000050H
; 421 : itrans_sp(blocks[4], mb_pred, currMB, pl, 8, 0);
push 0
push 8
push esi
lea edx, DWORD PTR [ebx+128]
push ebp
push edx
mov eax, edi
call _itrans_sp
; 422 : itrans_sp(blocks[5], mb_pred, currMB, pl, 12, 0);
push 0
push 12 ; 0000000cH
push esi
lea eax, DWORD PTR [ebx+160]
push ebp
push eax
mov eax, edi
call _itrans_sp
; 423 : itrans_sp(blocks[6], mb_pred, currMB, pl, 8, 4);
push 4
push 8
push esi
lea ecx, DWORD PTR [ebx+192]
push ebp
push ecx
mov eax, edi
call _itrans_sp
; 424 : itrans_sp(blocks[7], mb_pred, currMB, pl, 12, 4);
push 4
push 12 ; 0000000cH
push esi
lea edx, DWORD PTR [ebx+224]
push ebp
push edx
mov eax, edi
call _itrans_sp
add esp, 80 ; 00000050H
; 425 : itrans_sp(blocks[8], mb_pred, currMB, pl, 0, 8);
push 8
push 0
push esi
lea eax, DWORD PTR [ebx+256]
push ebp
push eax
mov eax, edi
call _itrans_sp
; 426 : itrans_sp(blocks[9], mb_pred, currMB, pl, 4, 8);
push 8
push 4
push esi
push ebp
lea ecx, DWORD PTR [ebx+288]
push ecx
mov eax, edi
call _itrans_sp
; 427 : itrans_sp(blocks[10], mb_pred, currMB, pl, 0, 12);
push 12 ; 0000000cH
push 0
push esi
lea edx, DWORD PTR [ebx+320]
push ebp
push edx
mov eax, edi
call _itrans_sp
; 428 : itrans_sp(blocks[11], mb_pred, currMB, pl, 4, 12);
push 12 ; 0000000cH
push 4
push esi
lea eax, DWORD PTR [ebx+352]
push ebp
push eax
mov eax, edi
call _itrans_sp
add esp, 80 ; 00000050H
; 429 : itrans_sp(blocks[12], mb_pred, currMB, pl, 8, 8);
push 8
push 8
push esi
lea ecx, DWORD PTR [ebx+384]
push ebp
push ecx
mov eax, edi
call _itrans_sp
; 430 : itrans_sp(blocks[13], mb_pred, currMB, pl, 12, 8);
push 8
push 12 ; 0000000cH
push esi
lea edx, DWORD PTR [ebx+416]
push ebp
push edx
mov eax, edi
call _itrans_sp
; 431 : itrans_sp(blocks[14], mb_pred, currMB, pl, 8, 12);
push 12 ; 0000000cH
push 8
push esi
lea eax, DWORD PTR [ebx+448]
push ebp
push eax
mov eax, edi
call _itrans_sp
; 432 : itrans_sp(blocks[15], mb_pred, currMB, pl, 12, 12);
push 12 ; 0000000cH
push 12 ; 0000000cH
push esi
add ebx, 480 ; 000001e0H
push ebp
push ebx
mov eax, edi
call _itrans_sp
mov ebp, DWORD PTR _currSlice$[esp+100]
add esp, 80 ; 00000050H
jmp COPY_16x16
$LN4@iMBtrans4x:
; 433 : }
; 434 : else if (currMB->is_lossless)
cmp DWORD PTR [edi+84], 0
je $LN2@iMBtrans4x
push 0
push 0
; 435 : {
; 436 : Inv_Residual_trans_4x4(currMB, pl, 0, 0);
push esi
push edi
call _Inv_Residual_trans_4x4
; 437 : Inv_Residual_trans_4x4(currMB, pl, 4, 0);
push 0
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
; 438 : Inv_Residual_trans_4x4(currMB, pl, 0, 4);
push 4
push 0
push esi
push edi
call _Inv_Residual_trans_4x4
; 439 : Inv_Residual_trans_4x4(currMB, pl, 4, 4);
push 4
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H
; 440 : Inv_Residual_trans_4x4(currMB, pl, 8, 0);
push 0
push 8
push esi
push edi
call _Inv_Residual_trans_4x4
; 441 : Inv_Residual_trans_4x4(currMB, pl, 12, 0);
push 0
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
; 442 : Inv_Residual_trans_4x4(currMB, pl, 8, 4);
push 4
push 8
push esi
push edi
call _Inv_Residual_trans_4x4
; 443 : Inv_Residual_trans_4x4(currMB, pl, 12, 4);
push 4
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H
; 444 : Inv_Residual_trans_4x4(currMB, pl, 0, 8);
push 8
push 0
push esi
push edi
call _Inv_Residual_trans_4x4
; 445 : Inv_Residual_trans_4x4(currMB, pl, 4, 8);
push 8
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
; 446 : Inv_Residual_trans_4x4(currMB, pl, 0, 12);
push 12 ; 0000000cH
push 0
push esi
push edi
call _Inv_Residual_trans_4x4
; 447 : Inv_Residual_trans_4x4(currMB, pl, 4, 12);
push 12 ; 0000000cH
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H
; 448 : Inv_Residual_trans_4x4(currMB, pl, 8, 8);
push 8
push 8
push esi
push edi
call _Inv_Residual_trans_4x4
; 449 : Inv_Residual_trans_4x4(currMB, pl, 12, 8);
push 8
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
; 450 : Inv_Residual_trans_4x4(currMB, pl, 8, 12);
push 12 ; 0000000cH
push 8
push esi
push edi
call _Inv_Residual_trans_4x4
; 451 : Inv_Residual_trans_4x4(currMB, pl, 12, 12);
push 12 ; 0000000cH
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H
; 452 : }
; 453 : else
jmp COPY_16x16
$LN2@iMBtrans4x:
; 454 : {
; 455 : const h264_short_block_t *blocks = currSlice->cof4[pl];
; 456 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
mov edx, esi
mov ecx, esi
shl edx, 8
shl ecx, 9
lea eax, DWORD PTR [edx+ebp]
lea ebx, DWORD PTR [ecx+ebp+cof@Slice]
; 457 : h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
; put things in registers that itrans4x4_mmx_direct wants
lea edx, [eax + mb_rec@Slice] ; mb_rec
lea ecx, [eax + mb_pred@Slice] ; mb_pred
mov eax, ebx ; blocks
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[0], mb_pred, mb_rec, 0, 0);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+32]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[1], mb_pred, mb_rec, 4, 0);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+128]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[4], mb_pred, mb_rec, 8, 0);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+160]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[5], mb_pred, mb_rec, 12, 0);
; second row
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+64]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[2], mb_pred, mb_rec, 0, 4);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+96]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[3], mb_pred, mb_rec, 4, 4);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+192]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[6], mb_pred, mb_rec, 8, 4);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+224]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[7], mb_pred, mb_rec, 12, 4);
; third row
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+256]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[8], mb_pred, mb_rec, 0, 8);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+288]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[9], mb_pred, mb_rec, 4, 8);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+384]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[12], mb_pred, mb_rec, 8, 8);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+416]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[13], mb_pred, mb_rec, 12, 8);
; fourth row
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+320]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[10], mb_pred, mb_rec, 0, 12);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+352]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[11], mb_pred, mb_rec, 4, 12);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+448]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[14], mb_pred, mb_rec, 8, 12);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+480]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[15], mb_pred, mb_rec, 12, 12);
COPY_16x16:
; construct picture from 4x4 blocks
; opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_rec[pl]);
mov eax, DWORD PTR [edi+40]
mov ecx, DWORD PTR [edi+36]
shl esi, 8
lea edx, DWORD PTR [esi+ebp+mb_rec@Slice]
push edx
mov edx, DWORD PTR _curr_img$[esp+20]
push eax
push ecx
push edx
call DWORD PTR _opt+copy_image_data_16x16_stride@OptimizedFunctions
add esp, 16 ; 00000010H
pop edi
pop esi
pop ebp
pop ebx
pop ecx
ret 0
_iMBtrans4x4 ENDP
_TEXT ENDS
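;
; The MMX path above visits the 16 4x4 blocks in the order 0,1,4,5 / 2,3,6,7 /
; 8,9,12,13 / 10,11,14,15 (raster order in pixels), advancing rec/pred by 4
; bytes per block and by 52 to reach the next block row (4 rows * 16 - 12).
; A sketch of the same traversal in C (itrans4x4 stands in for the
; (block, pred, rec) operation of _itrans4x4_mmx_direct; layout as above):
;
;;; #include <stdint.h>
;;; extern void itrans4x4(const int16_t block[16], const uint8_t *pred, uint8_t *rec);
;;; static const int block_order[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 };
;;; void iMBtrans4x4_mmx_loop(const int16_t blocks[16][16], const uint8_t pred[16][16], uint8_t rec[16][16])
;;; {
;;;     for (int i = 0; i < 16; i++) {
;;;         int k = block_order[i];
;;;         int x = (k & 1) * 4 + (k & 4) * 2;    // 0, 4, 8, 12
;;;         int y = ((k >> 1) & 1) * 4 + (k & 8); // 0, 4, 8, 12
;;;         itrans4x4(blocks[k], &pred[y][x], &rec[y][x]);
;;;     }
;;; }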
_TEXT SEGMENT
ALIGN 2
PUBLIC _itrans8x8_sse2
_itrans8x8_sse2 PROC NEAR
; parameter 1(mb_rec): 8 + ebp
; parameter 2(mb_pred): 12 + ebp
; parameter 3(block): 16 + ebp
; parameter 4(pos_x): 20 + ebp
push ebp
mov ebp, esp
and esp, -16
sub esp, 176
mov edx, DWORD PTR [ebp+20]
mov ecx, DWORD PTR [ebp+8] ; ecx: mb_rec
add ecx, edx
add edx, DWORD PTR [ebp+12] ; edx: mb_pred
mov eax, DWORD PTR [ebp+16] ; eax: block
;;; __m128i a0, a1, a2, a3;
;;; __m128i p0, p1, p2, p3, p4, p5, p6, p7;
;;; __m128i b0, b1, b2, b3, b4, b5, b6, b7;
;;; __m128i r0, r1, r2, r3, r4, r5, r6, r7;
;;; __m128i const32, zero;
;;; __declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
;;; __m128i pred0, pred1;
;;;
;;; const32 = _mm_load_si128((const __m128i *)c32);
movdqa xmm0, XMMWORD PTR const32
;;; zero = _mm_setzero_si128();
;;;
;;; // Horizontal
;;; b0 = _mm_load_si128((__m128i *)(block[0]));
movdqa xmm4, XMMWORD PTR [eax]
;;; b1 = _mm_load_si128((__m128i *)(block[1]));
movdqa xmm7, XMMWORD PTR [eax+16]
;;; b2 = _mm_load_si128((__m128i *)(block[2]));
movdqa xmm5, XMMWORD PTR [eax+32]
;;; b3 = _mm_load_si128((__m128i *)(block[3]));
movdqa xmm3, XMMWORD PTR [eax+48]
;;; b4 = _mm_load_si128((__m128i *)(block[4]));
movdqa xmm6, XMMWORD PTR [eax+64]
;;; b5 = _mm_load_si128((__m128i *)(block[5]));
;;; b6 = _mm_load_si128((__m128i *)(block[6]));
movdqa xmm1, XMMWORD PTR [eax+96]
;;; b7 = _mm_load_si128((__m128i *)(block[7]));
movdqa xmm2, XMMWORD PTR [eax+112]
movdqa XMMWORD PTR [esp], xmm0
movdqa xmm0, XMMWORD PTR [eax+80]
movdqa XMMWORD PTR [esp+16], xmm2
;;;
;;; /* rotate 8x8 (ugh) */
;;; r0 = _mm_unpacklo_epi16(b0, b2);
movdqa xmm2, xmm4
punpcklwd xmm2, xmm5
;;; r1 = _mm_unpacklo_epi16(b1, b3);
;;; r2 = _mm_unpackhi_epi16(b0, b2);
punpckhwd xmm4, xmm5
;;; r3 = _mm_unpackhi_epi16(b1, b3);
;;; r4 = _mm_unpacklo_epi16(b4, b6);
;;; r5 = _mm_unpacklo_epi16(b5, b7);
movdqa xmm5, xmm0
movdqa XMMWORD PTR [esp+32], xmm2
movdqa xmm2, xmm7
punpcklwd xmm2, xmm3
punpckhwd xmm7, xmm3
movdqa xmm3, xmm6
punpcklwd xmm3, xmm1
movdqa XMMWORD PTR [esp+48], xmm3
movdqa xmm3, XMMWORD PTR [esp+16]
punpcklwd xmm5, xmm3
;;; r6 = _mm_unpackhi_epi16(b4, b6);
punpckhwd xmm6, xmm1
;;; r7 = _mm_unpackhi_epi16(b5, b7);
punpckhwd xmm0, xmm3
;;;
;;; b0 = _mm_unpacklo_epi16(r0, r1);
movdqa xmm3, XMMWORD PTR [esp+32]
movdqa xmm1, xmm3
punpcklwd xmm1, xmm2
;;; b1 = _mm_unpackhi_epi16(r0, r1);
punpckhwd xmm3, xmm2
;;; b2 = _mm_unpacklo_epi16(r2, r3);
movdqa xmm2, xmm4
punpcklwd xmm2, xmm7
;;; b3 = _mm_unpackhi_epi16(r2, r3);
punpckhwd xmm4, xmm7
movdqa XMMWORD PTR [esp+64], xmm4
;;; b4 = _mm_unpacklo_epi16(r4, r5);
movdqa xmm4, XMMWORD PTR [esp+48]
movdqa xmm7, xmm4
punpcklwd xmm7, xmm5
;;; b5 = _mm_unpackhi_epi16(r4, r5);
punpckhwd xmm4, xmm5
;;; b6 = _mm_unpacklo_epi16(r6, r7);
movdqa xmm5, xmm6
punpcklwd xmm5, xmm0
;;; b7 = _mm_unpackhi_epi16(r6, r7);
punpckhwd xmm6, xmm0
;;;
;;; p0 = _mm_unpacklo_epi64(b0, b4);
movdqa xmm0, xmm1
punpcklqdq xmm0, xmm7
;;; p1 = _mm_unpackhi_epi64(b0, b4);
punpckhqdq xmm1, xmm7
movdqa XMMWORD PTR [esp+16], xmm1
;;; p2 = _mm_unpacklo_epi64(b1, b5);
movdqa xmm1, xmm3
punpcklqdq xmm1, xmm4
;;; p3 = _mm_unpackhi_epi64(b1, b5);
;;; p4 = _mm_unpacklo_epi64(b2, b6);
;;; p5 = _mm_unpackhi_epi64(b2, b6);
;;; p6 = _mm_unpacklo_epi64(b3, b7);
;;; p7 = _mm_unpackhi_epi64(b3, b7);
;;;
;;; /* perform approx DCT */
;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
movdqa xmm7, xmm1
psraw xmm7, 1
punpckhqdq xmm3, xmm4
movdqa XMMWORD PTR [esp+32], xmm3
movdqa xmm3, xmm2
punpcklqdq xmm3, xmm5
punpckhqdq xmm2, xmm5
movdqa xmm5, XMMWORD PTR [esp+64]
movdqa xmm4, xmm5
punpcklqdq xmm4, xmm6
punpckhqdq xmm5, xmm6
movdqa xmm6, xmm0
paddw xmm6, xmm3
psubw xmm0, xmm3
;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
movdqa xmm3, xmm4
;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
psraw xmm4, 1
psubw xmm3, xmm7
;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
paddw xmm1, xmm4
;;;
;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
movdqa xmm4, xmm6
;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
movdqa xmm7, xmm0
paddw xmm4, xmm1
psubw xmm7, xmm3
movdqa XMMWORD PTR [esp+48], xmm7
;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
paddw xmm0, xmm3
movdqa XMMWORD PTR [esp+80], xmm0
;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
;;;
;;; //-p3 + p5 - p7 - (p7 >> 1);
;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
movdqa xmm0, XMMWORD PTR [esp+32]
psubw xmm6, xmm1
movdqa xmm1, xmm5
psraw xmm1, 1
movdqa xmm3, xmm2
;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
;;;
;;; //p1 + p7 - p3 - (p3 >> 1);
;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
movdqa xmm7, xmm0
movdqa XMMWORD PTR [esp+96], xmm6
;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
movdqa xmm6, XMMWORD PTR [esp+16]
psubw xmm3, xmm0
psubw xmm3, xmm5
psraw xmm7, 1
psubw xmm3, xmm1
movdqa xmm1, xmm6
paddw xmm1, xmm5
;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
psubw xmm1, xmm0
;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
psubw xmm1, xmm7
;;;
;;; // -p1 + p7 + p5 + (p5 >> 1);
;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
movdqa xmm7, xmm2
psraw xmm7, 1
;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
psubw xmm5, xmm6
;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
paddw xmm5, xmm2
;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
paddw xmm5, xmm7
;;;
;;; // p3 + p5 + p1 + (p1 >> 1);
;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
paddw xmm0, xmm2
;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
;;; p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
;;; a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
;;;
;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
;;;
;;; p0 = _mm_add_epi16(b0, b7); // b0 + b7;
;;; p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
;;; p2 = _mm_add_epi16(b4, b3); // b4 + b3;
;;; p3 = _mm_add_epi16(b6, b1); // b6 + b1;
movdqa xmm2, XMMWORD PTR [esp+96]
paddw xmm0, xmm6
psraw xmm6, 1
paddw xmm0, xmm6
movdqa xmm7, xmm0
movdqa xmm6, xmm5
psraw xmm7, 2
paddw xmm7, xmm3
psraw xmm6, 2
paddw xmm6, xmm1
psraw xmm1, 2
psubw xmm5, xmm1
movdqa xmm1, xmm4
psraw xmm3, 2
psubw xmm0, xmm3
movdqa xmm3, XMMWORD PTR [esp+80]
movdqa XMMWORD PTR [esp+32], xmm0
;;; p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
;;; p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
;;; p6 = _mm_add_epi16(b2, b5); // b2 + b5;
;;; p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
psubw xmm4, XMMWORD PTR [esp+32]
paddw xmm1, xmm0
movdqa XMMWORD PTR [esp+112], xmm1
movdqa xmm1, XMMWORD PTR [esp+48]
movdqa xmm0, xmm1
psubw xmm0, xmm5
movdqa XMMWORD PTR [esp+16], xmm0
movdqa xmm0, xmm3
paddw xmm0, xmm6
psubw xmm3, xmm6
movdqa XMMWORD PTR [esp+128], xmm0
;;;
;;; /* rotate 8x8 (ugh) */
;;; r0 = _mm_unpacklo_epi16(p0, p2);
movdqa xmm6, XMMWORD PTR [esp+128]
movdqa xmm0, xmm2
paddw xmm0, xmm7
psubw xmm2, xmm7
paddw xmm1, xmm5
movdqa xmm5, XMMWORD PTR [esp+112]
movdqa XMMWORD PTR [esp+144], xmm4
movdqa xmm4, xmm5
punpcklwd xmm4, xmm6
;;; r1 = _mm_unpacklo_epi16(p1, p3);
;;; r2 = _mm_unpackhi_epi16(p0, p2);
punpckhwd xmm5, xmm6
;;; r3 = _mm_unpackhi_epi16(p1, p3);
;;; r4 = _mm_unpacklo_epi16(p4, p6);
;;; r5 = _mm_unpacklo_epi16(p5, p7);
movdqa xmm6, xmm3
movdqa XMMWORD PTR [esp+64], xmm4
movdqa xmm4, XMMWORD PTR [esp+16]
movdqa xmm7, xmm4
punpcklwd xmm7, xmm0
punpckhwd xmm4, xmm0
movdqa xmm0, xmm2
punpcklwd xmm0, xmm1
movdqa XMMWORD PTR [esp+128], xmm0
movdqa xmm0, XMMWORD PTR [esp+144]
punpcklwd xmm6, xmm0
;;; r6 = _mm_unpackhi_epi16(p4, p6);
punpckhwd xmm2, xmm1
;;; r7 = _mm_unpackhi_epi16(p5, p7);
;;;
;;; b0 = _mm_unpacklo_epi16(r0, r1);
movdqa xmm1, XMMWORD PTR [esp+64]
punpckhwd xmm3, xmm0
movdqa xmm0, xmm1
punpcklwd xmm0, xmm7
;;; b1 = _mm_unpackhi_epi16(r0, r1);
punpckhwd xmm1, xmm7
;;; b2 = _mm_unpacklo_epi16(r2, r3);
movdqa xmm7, xmm5
punpcklwd xmm7, xmm4
;;; b3 = _mm_unpackhi_epi16(r2, r3);
punpckhwd xmm5, xmm4
movdqa XMMWORD PTR [esp+112], xmm5
;;; b4 = _mm_unpacklo_epi16(r4, r5);
movdqa xmm5, XMMWORD PTR [esp+128]
movdqa xmm4, xmm5
punpcklwd xmm4, xmm6
;;; b5 = _mm_unpackhi_epi16(r4, r5);
punpckhwd xmm5, xmm6
;;; b6 = _mm_unpacklo_epi16(r6, r7);
movdqa xmm6, xmm2
punpcklwd xmm6, xmm3
;;; b7 = _mm_unpackhi_epi16(r6, r7);
punpckhwd xmm2, xmm3
;;;
;;; p0 = _mm_unpacklo_epi64(b0, b4);
movdqa xmm3, xmm0
punpcklqdq xmm3, xmm4
;;; p1 = _mm_unpackhi_epi64(b0, b4);
punpckhqdq xmm0, xmm4
movdqa XMMWORD PTR [esp+144], xmm0
;;; p2 = _mm_unpacklo_epi64(b1, b5);
;;; p3 = _mm_unpackhi_epi64(b1, b5);
;;; p4 = _mm_unpacklo_epi64(b2, b6);
;;; p5 = _mm_unpackhi_epi64(b2, b6);
;;; p6 = _mm_unpacklo_epi64(b3, b7);
movdqa xmm0, XMMWORD PTR [esp+112]
movdqa xmm4, xmm1
punpcklqdq xmm4, xmm5
punpckhqdq xmm1, xmm5
movdqa XMMWORD PTR [esp+64], xmm1
movdqa xmm1, xmm7
movdqa xmm5, xmm0
punpcklqdq xmm1, xmm6
punpckhqdq xmm7, xmm6
;;; p7 = _mm_unpackhi_epi64(b3, b7);
;;;
;;;
;;; /* Vertical */
;;;
;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
movdqa xmm6, xmm4
psraw xmm6, 1
punpcklqdq xmm5, xmm2
punpckhqdq xmm0, xmm2
movdqa xmm2, xmm3
paddw xmm2, xmm1
psubw xmm3, xmm1
;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
movdqa xmm1, xmm5
;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
psraw xmm5, 1
psubw xmm1, xmm6
;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
paddw xmm4, xmm5
;;;
;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
movdqa xmm5, xmm2
;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
movdqa xmm6, xmm3
paddw xmm5, xmm4
psubw xmm6, xmm1
movdqa XMMWORD PTR [esp+128], xmm6
;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
;;;
;;; //-p3 + p5 - p7 - (p7 >> 1);
;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
movdqa xmm6, XMMWORD PTR [esp+64]
paddw xmm3, xmm1
movdqa XMMWORD PTR [esp+80], xmm3
psubw xmm2, xmm4
movdqa xmm1, xmm0
psraw xmm1, 1
movdqa xmm3, xmm7
movdqa XMMWORD PTR [esp+96], xmm2
psubw xmm3, xmm6
;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
psubw xmm3, xmm0
;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
;;;
;;; //p1 + p7 - p3 - (p3 >> 1);
;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
movdqa xmm2, xmm6
psraw xmm2, 1
psubw xmm3, xmm1
;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
movdqa xmm1, XMMWORD PTR [esp+144]
movdqa xmm4, xmm1
paddw xmm4, xmm0
;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
psubw xmm4, xmm6
;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
psubw xmm4, xmm2
;;;
;;; // -p1 + p7 + p5 + (p5 >> 1);
;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
movdqa xmm2, xmm7
psraw xmm2, 1
;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
psubw xmm0, xmm1
;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
paddw xmm0, xmm7
;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
paddw xmm0, xmm2
;;;
;;; // p3 + p5 + p1 + (p1 >> 1);
;;; r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
movdqa xmm2, xmm1
psraw xmm2, 1
;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
paddw xmm6, xmm7
;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
;;; a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
;;;
;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
;;;
;;; r0 = _mm_add_epi16(b0, b7); // b0 + b7;
;;; r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
movdqa xmm7, XMMWORD PTR [esp+128]
paddw xmm6, xmm1
paddw xmm6, xmm2
movdqa xmm1, xmm6
psraw xmm1, 2
movdqa xmm2, xmm0
paddw xmm1, xmm3
psraw xmm2, 2
paddw xmm2, xmm4
psraw xmm4, 2
psubw xmm0, xmm4
psraw xmm3, 2
psubw xmm6, xmm3
movdqa XMMWORD PTR [esp+64], xmm6
movdqa xmm3, xmm5
;;; r2 = _mm_add_epi16(b4, b3); // b4 + b3;
;;; r3 = _mm_add_epi16(b6, b1); // b6 + b1;
;;; r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
;;; r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
;;; r6 = _mm_add_epi16(b2, b5); // b2 + b5;
;;; r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
psubw xmm5, XMMWORD PTR [esp+64]
paddw xmm3, xmm6
movdqa XMMWORD PTR [esp+144], xmm3
movdqa xmm3, xmm7
psubw xmm3, xmm0
movdqa XMMWORD PTR [esp+48], xmm3
movdqa xmm3, XMMWORD PTR [esp+80]
movdqa xmm4, xmm3
paddw xmm4, xmm2
psubw xmm3, xmm2
;;;
;;;
;;; // add in prediction values
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
;;; // (x + 32) >> 6
;;; r0 = _mm_adds_epi16(r0, const32);
movdqa xmm2, XMMWORD PTR const32
movdqa XMMWORD PTR [esp+16], xmm4
movdqa xmm4, XMMWORD PTR [esp+96]
movdqa xmm6, xmm4
paddw xmm6, xmm1
psubw xmm4, xmm1
;;; r0 = _mm_srai_epi16(r0, 6);
;;; r1 = _mm_adds_epi16(r1, const32);
movdqa xmm1, XMMWORD PTR [esp+48]
paddw xmm7, xmm0
movdqa xmm0, XMMWORD PTR [esp+144]
movdqa XMMWORD PTR [esp+128], xmm7
;;; r1 = _mm_srai_epi16(r1, 6);
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
movq xmm7, QWORD PTR [edx+16]
movdqa XMMWORD PTR [esp+32], xmm5
paddsw xmm0, xmm2
psraw xmm0, 6
paddsw xmm1, xmm2
pxor xmm2, xmm2
punpcklbw xmm7, xmm2
movq xmm5, QWORD PTR [edx]
punpcklbw xmm5, xmm2
psraw xmm1, 6
;;; pred0 = _mm_adds_epi16(pred0, r0);
;;; pred1 = _mm_adds_epi16(pred1, r1);
paddsw xmm7, xmm1
paddsw xmm5, xmm0
;;;
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
packuswb xmm5, xmm7
;;;
;;; // store
;;; _mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
movdqa xmm0, XMMWORD PTR [esp+32]
movdqa xmm2, XMMWORD PTR [esp+128]
movq QWORD PTR [ecx], xmm5
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
;;; pred0 = _mm_srli_si128(pred0, 8);
psrldq xmm5, 8
;;; _mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
movq QWORD PTR [ecx+16], xmm5
;;;
;;; /* --- */
;;;
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
movq xmm1, QWORD PTR [edx+32]
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
;;; // (x + 32) >> 6
;;; r2 = _mm_adds_epi16(r2, const32);
movdqa xmm5, XMMWORD PTR [esp]
movdqa XMMWORD PTR [esp+32], xmm0
;;; r2 = _mm_srai_epi16(r2, 6);
;;; r3 = _mm_adds_epi16(r3, const32);
paddsw xmm6, xmm5
;;; r3 = _mm_srai_epi16(r3, 6);
psraw xmm6, 6
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
pxor xmm7, xmm7
punpcklbw xmm1, xmm7
movdqa xmm0, XMMWORD PTR [esp+16]
paddsw xmm0, xmm5
psraw xmm0, 6
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
;;; pred0 = _mm_adds_epi16(pred0, r2);
paddsw xmm1, xmm0
;;; pred1 = _mm_adds_epi16(pred1, r3);
;;;
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
;;;
;;; // store
;;; _mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
movdqa xmm0, XMMWORD PTR [esp+32]
movq xmm5, QWORD PTR [edx+48]
punpcklbw xmm5, xmm7
paddsw xmm5, xmm6
packuswb xmm1, xmm5
movq QWORD PTR [ecx+32], xmm1
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
;;; pred0 = _mm_srli_si128(pred0, 8);
psrldq xmm1, 8
;;; _mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
movq QWORD PTR [ecx+48], xmm1
;;;
;;; /* --- */
;;;
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
movq xmm7, QWORD PTR [edx+64]
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
movq xmm6, QWORD PTR [edx+80]
;;; // (x + 32) >> 6
;;; r4 = _mm_adds_epi16(r4, const32);
;;; r4 = _mm_srai_epi16(r4, 6);
;;; r5 = _mm_adds_epi16(r5, const32);
;;; r5 = _mm_srai_epi16(r5, 6);
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
pxor xmm5, xmm5
punpcklbw xmm7, xmm5
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
punpcklbw xmm6, xmm5
movdqa xmm1, XMMWORD PTR [esp]
paddsw xmm4, xmm1
psraw xmm4, 6
paddsw xmm3, xmm1
psraw xmm3, 6
;;; pred0 = _mm_adds_epi16(pred0, r4);
paddsw xmm7, xmm4
;;; pred1 = _mm_adds_epi16(pred1, r5);
paddsw xmm6, xmm3
;;;
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
packuswb xmm7, xmm6
;;;
;;; // store
;;; _mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
movq QWORD PTR [ecx+64], xmm7
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
;;; pred0 = _mm_srli_si128(pred0, 8);
psrldq xmm7, 8
;;; _mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
movq QWORD PTR [ecx+80], xmm7
;;;
;;; /* --- */
;;;
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
movq xmm5, QWORD PTR [edx+96]
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
movq xmm4, QWORD PTR [edx+112]
;;; // (x + 32) >> 6
;;; r6 = _mm_adds_epi16(r6, const32);
;;; r6 = _mm_srai_epi16(r6, 6);
;;; r7 = _mm_adds_epi16(r7, const32);
;;; r7 = _mm_srai_epi16(r7, 6);
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
pxor xmm3, xmm3
punpcklbw xmm5, xmm3
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
punpcklbw xmm4, xmm3
movdqa xmm1, XMMWORD PTR [esp]
paddsw xmm2, xmm1
psraw xmm2, 6
paddsw xmm0, xmm1
psraw xmm0, 6
;;; pred0 = _mm_adds_epi16(pred0, r6);
paddsw xmm5, xmm2
;;; pred1 = _mm_adds_epi16(pred1, r7);
paddsw xmm4, xmm0
;;;
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
packuswb xmm5, xmm4
;;;
;;; // store
;;; _mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
movq QWORD PTR [ecx+96], xmm5
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
;;; pred0 = _mm_srli_si128(pred0, 8);
psrldq xmm5, 8
;;; _mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
movq QWORD PTR [ecx+112], xmm5
mov esp, ebp
pop ebp
ret
ALIGN 2
_itrans8x8_sse2 ENDP
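;
; The two "rotate 8x8 (ugh)" sections are a word transpose built from 16-bit
; and 64-bit unpacks. A minimal standalone sketch of that network with SSE2
; intrinsics (assuming eight 16-byte-aligned rows of eight int16 values; a
; reference for reading the asm, not code from the original source):
;
;;; #include <emmintrin.h>
;;; static void transpose8x8_epi16(__m128i t[8])
;;; {
;;;     /* pass 1: interleave rows two apart (16-bit granularity) */
;;;     __m128i r0 = _mm_unpacklo_epi16(t[0], t[2]), r1 = _mm_unpacklo_epi16(t[1], t[3]);
;;;     __m128i r2 = _mm_unpackhi_epi16(t[0], t[2]), r3 = _mm_unpackhi_epi16(t[1], t[3]);
;;;     __m128i r4 = _mm_unpacklo_epi16(t[4], t[6]), r5 = _mm_unpacklo_epi16(t[5], t[7]);
;;;     __m128i r6 = _mm_unpackhi_epi16(t[4], t[6]), r7 = _mm_unpackhi_epi16(t[5], t[7]);
;;;     /* pass 2: now each register holds two full columns of a 4-row half */
;;;     __m128i b0 = _mm_unpacklo_epi16(r0, r1), b1 = _mm_unpackhi_epi16(r0, r1);
;;;     __m128i b2 = _mm_unpacklo_epi16(r2, r3), b3 = _mm_unpackhi_epi16(r2, r3);
;;;     __m128i b4 = _mm_unpacklo_epi16(r4, r5), b5 = _mm_unpackhi_epi16(r4, r5);
;;;     __m128i b6 = _mm_unpacklo_epi16(r6, r7), b7 = _mm_unpackhi_epi16(r6, r7);
;;;     /* pass 3: glue the top and bottom halves of each column (64-bit) */
;;;     t[0] = _mm_unpacklo_epi64(b0, b4); // column 0 becomes row 0
;;;     t[1] = _mm_unpackhi_epi64(b0, b4);
;;;     t[2] = _mm_unpacklo_epi64(b1, b5);
;;;     t[3] = _mm_unpackhi_epi64(b1, b5);
;;;     t[4] = _mm_unpacklo_epi64(b2, b6);
;;;     t[5] = _mm_unpackhi_epi64(b2, b6);
;;;     t[6] = _mm_unpacklo_epi64(b3, b7);
;;;     t[7] = _mm_unpackhi_epi64(b3, b7);
;;; }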
END