12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626 |
- .686
- .XMM
- .model FLAT
- ; Structure/member offsets mirrored from the C headers.
- ; NOTE(review): these must stay in sync with the C struct layouts
- ; (OptimizedFunctions, VideoParameters, MacroBlock, StorablePicture,
- ; Slice) - confirm against the headers when either side changes.
- copy_image_data_16x16_stride@OptimizedFunctions = 32
- dec_picture@VideoParameters = 698192
- p_Slice@MacroBlock = 0
- plane_images@StorablePicture = 158512
- mb_rec@Slice = 1696
- mb_pred@Slice = 928
- cof@Slice = 2464
- CONST SEGMENT
- align 16
- ; Eight int16 values of 32 (020H): the "+32" rounding term used by the
- ; inverse-transform kernels before their ">> 6" scaling.
- const32 DW 020H, 020H, 020H, 020H, 020H, 020H, 020H, 020H
- CONST ENDS
- ;
- ;
- ;
- ;
- PUBLIC _weighted_bi_prediction4x4
- _TEXT SEGMENT
- mb_pred = 4
- block_l0 = 8
- wp_scale_l0 = 12
- wp_scale_l1 = 16
- wp_offset = 20
- weight_denom = 24
- _weighted_bi_prediction4x4 PROC ; COMDAT
- ; Explicit weighted bi-prediction of one 4x4 block (H.264 WP, MMX).
- ;   pred = clip(((block_l0*wp_scale_l0 + mb_pred*wp_scale_l1 + rnd)
- ;                >> weight_denom) + wp_offset)
- ; where rnd = 1 << (weight_denom - 1) when weight_denom > 0; when
- ; weight_denom <= 0 the weighted sum is shifted LEFT by -weight_denom
- ; with no rounding term (second code path below).
- ; mb_pred: in/out - list-1 prediction on entry, final prediction on
- ; exit (rows 16 bytes apart). block_l0: list-0 prediction, same stride.
- ; packuswb performs the unsigned 8-bit clip.
- ; BUGFIX: both paths previously stored the row-2 result to offset 0,
- ; clobbering row 0 and leaving row 2 stale; stores now target offset 32.
- mov eax, DWORD PTR weight_denom[esp]
- pxor mm0, mm0 ; zero, for byte->word unpack
- pshufw mm1, MMWORD PTR wp_scale_l0[esp], 0 ; broadcast l0 weight to 4 words
- test eax, eax
- pshufw mm2, MMWORD PTR wp_scale_l1[esp], 0 ; broadcast l1 weight
- pshufw mm3, MMWORD PTR wp_offset[esp], 0 ; broadcast offset
- jle BI_PRED4x4@LEFT_SHIFT ; weight_denom <= 0 -> left-shift path
- movd mm4, eax ; right-shift count
- lea ecx, DWORD PTR [eax-1] ;
- mov edx, 1
- shl edx, cl ; rounding term 1 << (weight_denom - 1)
- movd mm5, edx
- mov eax, mb_pred[esp]
- mov edx, block_l0[esp]
- pshufw mm5, mm5, 0 ; broadcast rounding term
- ; ---- row 0 ----
- movd mm6, DWORD PTR 0[edx] ; block_l0
- movd mm7, DWORD PTR 0[eax] ; mb_pred
- punpcklbw mm6, mm0
- punpcklbw mm7, mm0
- pmullw mm6, mm1
- pmullw mm7, mm2
- paddw mm6, mm7
- movd mm7, DWORD PTR 16[eax] ; mb_pred row 1 (loaded before row-0 store)
- paddw mm6, mm5
- psraw mm6, mm4
- paddw mm6, mm3
- packuswb mm6, mm6
- movd DWORD PTR 0[eax], mm6
- ; ---- row 1 ----
- movd mm6, DWORD PTR 16[edx] ; block_l0
- punpcklbw mm6, mm0
- punpcklbw mm7, mm0
- pmullw mm6, mm1
- pmullw mm7, mm2
- paddw mm6, mm7
- movd mm7, DWORD PTR 32[eax] ; mb_pred row 2
- paddw mm6, mm5
- psraw mm6, mm4
- paddw mm6, mm3
- packuswb mm6, mm6
- movd DWORD PTR 16[eax], mm6
- ; ---- row 2 ----
- movd mm6, DWORD PTR 32[edx] ; block_l0
- punpcklbw mm6, mm0
- punpcklbw mm7, mm0
- pmullw mm6, mm1
- pmullw mm7, mm2
- paddw mm6, mm7
- movd mm7, DWORD PTR 48[eax] ; mb_pred row 3
- paddw mm6, mm5
- psraw mm6, mm4
- paddw mm6, mm3
- packuswb mm6, mm6
- movd DWORD PTR 32[eax], mm6 ; BUGFIX: was 0[eax]
- ; ---- row 3 ----
- movd mm6, DWORD PTR 48[edx] ; block_l0
- punpcklbw mm6, mm0
- punpcklbw mm7, mm0
- pmullw mm6, mm1
- pmullw mm7, mm2
- paddw mm6, mm7
- paddw mm6, mm5
- psraw mm6, mm4
- paddw mm6, mm3
- packuswb mm6, mm6
- movd DWORD PTR 48[eax], mm6
- ret 0
- BI_PRED4x4@LEFT_SHIFT:
- ; weight_denom <= 0: shift left by -weight_denom, no rounding term
- neg eax
- movd mm4, eax ; left-shift count (>= 0)
- mov eax, mb_pred[esp]
- mov edx, block_l0[esp]
- ; ---- row 0 ----
- movd mm6, DWORD PTR 0[edx] ; block_l0
- movd mm7, DWORD PTR 0[eax] ; mb_pred
- punpcklbw mm6, mm0
- punpcklbw mm7, mm0
- pmullw mm6, mm1
- pmullw mm7, mm2
- paddw mm6, mm7
- movd mm7, DWORD PTR 16[eax] ; mb_pred row 1
- psllw mm6, mm4
- paddw mm6, mm3
- packuswb mm6, mm6
- movd DWORD PTR 0[eax], mm6
- ; ---- row 1 ----
- movd mm6, DWORD PTR 16[edx] ; block_l0
- punpcklbw mm6, mm0
- punpcklbw mm7, mm0
- pmullw mm6, mm1
- pmullw mm7, mm2
- paddw mm6, mm7
- movd mm7, DWORD PTR 32[eax] ; mb_pred row 2
- psllw mm6, mm4
- paddw mm6, mm3
- packuswb mm6, mm6
- movd DWORD PTR 16[eax], mm6
- ; ---- row 2 ----
- movd mm6, DWORD PTR 32[edx] ; block_l0
- punpcklbw mm6, mm0
- punpcklbw mm7, mm0
- pmullw mm6, mm1
- pmullw mm7, mm2
- paddw mm6, mm7
- movd mm7, DWORD PTR 48[eax] ; mb_pred row 3
- psllw mm6, mm4
- paddw mm6, mm3
- packuswb mm6, mm6
- movd DWORD PTR 32[eax], mm6 ; BUGFIX: was 0[eax]
- ; ---- row 3 ----
- movd mm6, DWORD PTR 48[edx] ; block_l0
- punpcklbw mm6, mm0
- punpcklbw mm7, mm0
- pmullw mm6, mm1
- pmullw mm7, mm2
- paddw mm6, mm7
- psllw mm6, mm4
- paddw mm6, mm3
- packuswb mm6, mm6
- movd DWORD PTR 48[eax], mm6
- ret 0
- _weighted_bi_prediction4x4 ENDP
- _TEXT ENDS
- PUBLIC _itrans4x4_mmx
- _TEXT SEGMENT
- _tblock$ = 4 ; size = 4
- _mb_pred$ = 8 ; size = 4
- _mb_rec$ = 12 ; size = 4
- _pos_x$ = 16 ; size = 4
- _pos_y$ = 20 ; size = 4
- _itrans4x4_mmx PROC ; COMDAT
- ; itrans4x4(tblock, mb_pred, mb_rec, pos_x, pos_y)
- ; Wrapper: computes the byte offset pos_y*16 + pos_x (pred/rec rows are
- ; 16 bytes apart), then sets eax = tblock, ecx = &mb_pred[pos_y][pos_x],
- ; edx = &mb_rec[pos_y][pos_x] and falls through into
- ; _itrans4x4_mmx_direct below.
- mov edx, DWORD PTR _pos_y$[esp]
- shl edx, 4
- add edx, DWORD PTR _pos_x$[esp]
- mov eax, DWORD PTR _tblock$[esp]
- mov ecx, DWORD PTR _mb_pred$[esp]
- add ecx, edx
- add edx, DWORD PTR _mb_rec$[esp]
- _itrans4x4_mmx_direct PROC ; COMDAT
- ; H.264 4x4 inverse transform + reconstruction (MMX).
- ; Register contract: eax = 4x4 int16 coefficient block (8 bytes/row),
- ; ecx = prediction bytes (rows 16 bytes apart), edx = reconstruction
- ; output bytes (rows 16 bytes apart).
- ; Writes rec = clip(pred + ((idct(tblock) + 32) >> 6)) row by row;
- ; packuswb supplies the unsigned 8-bit clip.
- ; NOTE(review): no emms here - the caller is assumed to manage MMX/x87
- ; state; confirm at call sites.
- ; load 4x4 matrix
- movq mm0, MMWORD PTR 0[eax]
- movq mm1, MMWORD PTR 8[eax]
- movq mm2, MMWORD PTR 16[eax]
- movq mm3, MMWORD PTR 24[eax]
- ; transpose 4x4 matrix (rows -> columns) for the horizontal pass
- movq mm4, mm0 ; p0 = mm4 (copy)
- punpcklwd mm0, mm2 ; r0 = mm0
- punpckhwd mm4, mm2 ; r2 = mm4
- movq mm5, mm1 ; p1 = mm5 (copy)
- punpcklwd mm1, mm3 ; r1 = mm1
- punpckhwd mm5, mm3 ; r3 = mm5
- movq mm6, mm0 ; r0 = mm6 (copy)
- punpcklwd mm0, mm1 ; t0 = mm0
- punpckhwd mm6, mm1 ; t1 = mm6
- movq mm1, mm4 ; r2 = mm1 (copy)
- punpcklwd mm1, mm5 ; t2 = mm1
- punpckhwd mm4, mm5 ; t3 = mm4
- ; butterfly: p0 = t0 + t2, p1 = t0 - t2, p2 = (t1>>1) - t3, p3 = t1 + (t3>>1)
- movq mm2, mm0 ; mm2 = t0 (copy)
- paddw mm0, mm1 ; mm0 = p0
- psubw mm2, mm1 ; mm2 = p1, mm1 available
- movq mm5, mm6 ; mm5 = t1 (copy)
- psraw mm5, 1 ; mm5 = (t1 >> 1)
- psubw mm5, mm4 ; mm5 = p2
- psraw mm4, 1 ; mm4 = (t3 >> 1)
- paddw mm6, mm4 ; mm6 = p3
- movq mm3, mm0 ; mm3 = p0 (copy)
- paddw mm0, mm6 ; mm0 = r0
- movq mm1, mm2 ; mm1 = p1 (copy)
- paddw mm1, mm5 ; mm1 = r1
- psubw mm2, mm5 ; mm2 = r2, mm5 available
- psubw mm3, mm6 ; mm3 = r3
- ; transpose 4x4 matrix again to set up for vertical pass
- movq mm4, mm0 ; r0 = mm4 (copy)
- punpcklwd mm0, mm2 ; p0 = mm0
- punpckhwd mm4, mm2 ; p2 = mm4
- movq mm5, mm1 ; r1 = mm5 (copy)
- punpcklwd mm1, mm3 ; p1 = mm1
- punpckhwd mm5, mm3 ; p3 = mm5
- movq mm6, mm0 ; p0 = mm6 (copy)
- punpcklwd mm0, mm1 ; t0 = mm0
- punpckhwd mm6, mm1 ; t1 = mm6
- movq mm1, mm4 ; p2 = mm1 (copy)
- punpcklwd mm1, mm5 ; t2 = mm1
- punpckhwd mm4, mm5 ; t3 = mm4
- ; same butterfly as the horizontal pass
- movq mm2, mm0 ; mm2 = t0 (copy)
- paddw mm0, mm1 ; mm0 = p0
- psubw mm2, mm1 ; mm2 = p1, mm1 available
- movq mm5, mm6 ; mm5 = t1 (copy)
- psraw mm5, 1 ; mm5 = (t1 >> 1)
- psubw mm5, mm4 ; mm5 = p2
- psraw mm4, 1 ; mm4 = (t3 >> 1)
- paddw mm6, mm4 ; mm6 = p3
- movq mm3, mm0 ; mm3 = p0 (copy)
- paddw mm0, mm6 ; mm0 = r0
- movq mm1, mm2 ; mm1 = p1 (copy)
- paddw mm1, mm5 ; mm1 = r1
- psubw mm2, mm5 ; mm2 = r2, mm5 available
- psubw mm3, mm6 ; mm3 = r3
- ; --- 4x4 iDCT done, now time to combine with mpr ---
- movq mm7, MMWORD PTR const32
- paddw mm0, mm7 ; rres + 32
- psraw mm0, 6 ; (rres + 32) >> 6
- paddw mm1, mm7 ; rres + 32
- psraw mm1, 6 ; (rres + 32) >> 6
- paddw mm2, mm7 ; rres + 32
- psraw mm2, 6 ; (rres + 32) >> 6
- paddw mm3, mm7 ; rres + 32
- psraw mm3, 6 ; (rres + 32) >> 6
- pxor mm7, mm7
- ; convert mpr from unsigned char to short
- movd mm4, DWORD PTR 0[ecx]
- movd mm5, DWORD PTR 16[ecx]
- movd mm6, DWORD PTR 32[ecx]
- punpcklbw mm4, mm7
- punpcklbw mm5, mm7
- punpcklbw mm6, mm7
- paddsw mm4, mm0 ; pred_row + rres_row
- movd mm0, DWORD PTR 48[ecx] ; reuse mm0 for mpr[3]
- paddsw mm5, mm1 ; pred_row + rres_row
- punpcklbw mm0, mm7
- paddsw mm6, mm2 ; pred_row + rres_row
- paddsw mm0, mm3 ; pred_row + rres_row
- ; results in mm4, mm5, mm6, mm0
- 
- ; move back to 8 bit (packuswb clips to [0, 255])
- packuswb mm4, mm7
- packuswb mm5, mm7
- packuswb mm6, mm7
- packuswb mm0, mm7
- movd DWORD PTR 0[edx], mm4
- movd DWORD PTR 16[edx], mm5
- movd DWORD PTR 32[edx], mm6
- movd DWORD PTR 48[edx], mm0
- ret 0
- _itrans4x4_mmx_direct ENDP
- _itrans4x4_mmx ENDP
- _TEXT ENDS
- EXTRN _itrans_sp:PROC
- EXTRN _Inv_Residual_trans_4x4:PROC
- PUBLIC _iMBtrans4x4
- EXTRN _opt:BYTE
- _TEXT SEGMENT
- _currSlice$ = -4 ; size = 4
- _mb_rec$166704 = 8 ; size = 4
- _currMB$ = 8 ; size = 4
- _curr_img$ = 12 ; size = 4
- _pl$ = 8 ; second parameter
- _smb$ = 16 ; size = 4
- _iMBtrans4x4 PROC
- ; iMBtrans4x4(currMB, pl, smb) - inverse-transforms all sixteen 4x4
- ; blocks of one macroblock plane, then copies the reconstructed 16x16
- ; region into the current picture image.
- ; Three paths: smb != 0 -> itrans_sp per block; currMB->is_lossless ->
- ; Inv_Residual_trans_4x4 per block; otherwise inline calls to
- ; _itrans4x4_mmx_direct.
- ; Register roles after the prologue: edi = currMB, esi = pl,
- ; ebp = currSlice (also spilled to the _currSlice$ local, since the
- ; itrans_sp path repurposes ebp as the mb_pred base).
- push ecx
- push ebx
- push ebp
- push esi
- STACKOFFSET = 16
- ; 408 : VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1]: dec_picture->imgY;
- mov esi, DWORD PTR _pl$[esp+STACKOFFSET]
- push edi
- STACKOFFSET = STACKOFFSET + 4
- mov edi, DWORD PTR _currMB$[esp+16]
- mov ebp, DWORD PTR [edi+p_Slice@MacroBlock] ; ebp: currMB->p_Slice
- mov eax, DWORD PTR [edi+4]
- mov eax, DWORD PTR [eax+dec_picture@VideoParameters] ; eax: p_Vid->dec_picture;
- mov DWORD PTR _currSlice$[esp+20], ebp ; spill currSlice for later reload
- mov ecx, DWORD PTR [eax+esi*4+plane_images@StorablePicture]
- mov DWORD PTR _curr_img$[esp+16], ecx
- 
- cmp DWORD PTR _smb$[esp+16], 0 ; if (smb)
- ; 413 : {
- ; 414 : h264_short_block_t *blocks = currSlice->cof4[pl];
- ; 415 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
- ; 416 : 
- ; 417 : itrans_sp(blocks[0], mb_pred, currMB, pl, 0, 0);
- je $LN4@iMBtrans4x
- ; ebx = &cof[pl] (512 bytes per plane), ebp = &mb_pred[pl] (256 bytes
- ; per plane); each 4x4 coefficient block is 32 bytes, hence the +32
- ; steps below. itrans_sp takes currMB in eax plus 5 stack args; the
- ; stack args are reclaimed in batches of 80 bytes (4 calls x 20).
- push 0
- push 0
- mov eax, esi
- shl eax, 9
- lea ebx, DWORD PTR [eax+ebp+cof@Slice]
- mov ecx, esi
- shl ecx, 8
- lea ebp, DWORD PTR [ecx+ebp+mb_pred@Slice]
- push esi
- push ebp
- push ebx
- mov eax, edi
- call _itrans_sp
- ; 418 : itrans_sp(blocks[1], mb_pred, currMB, pl, 4, 0);
- push 0
- push 4
- push esi
- lea edx, DWORD PTR [ebx+32]
- push ebp
- push edx
- mov eax, edi
- call _itrans_sp
- ; 419 : itrans_sp(blocks[2], mb_pred, currMB, pl, 0, 4);
- push 4
- push 0
- push esi
- lea eax, DWORD PTR [ebx+64]
- push ebp
- push eax
- mov eax, edi
- call _itrans_sp
- ; 420 : itrans_sp(blocks[3], mb_pred, currMB, pl, 4, 4);
- push 4
- push 4
- push esi
- lea ecx, DWORD PTR [ebx+96]
- push ebp
- push ecx
- mov eax, edi
- call _itrans_sp
- add esp, 80 ; 00000050H
- ; 421 : itrans_sp(blocks[4], mb_pred, currMB, pl, 8, 0);
- push 0
- push 8
- push esi
- lea edx, DWORD PTR [ebx+128]
- push ebp
- push edx
- mov eax, edi
- call _itrans_sp
- ; 422 : itrans_sp(blocks[5], mb_pred, currMB, pl, 12, 0);
- push 0
- push 12 ; 0000000cH
- push esi
- lea eax, DWORD PTR [ebx+160]
- push ebp
- push eax
- mov eax, edi
- call _itrans_sp
- ; 423 : itrans_sp(blocks[6], mb_pred, currMB, pl, 8, 4);
- push 4
- push 8
- push esi
- lea ecx, DWORD PTR [ebx+192]
- push ebp
- push ecx
- mov eax, edi
- call _itrans_sp
- ; 424 : itrans_sp(blocks[7], mb_pred, currMB, pl, 12, 4);
- push 4
- push 12 ; 0000000cH
- push esi
- lea edx, DWORD PTR [ebx+224]
- push ebp
- push edx
- mov eax, edi
- call _itrans_sp
- add esp, 80 ; 00000050H
- ; 425 : itrans_sp(blocks[8], mb_pred, currMB, pl, 0, 8);
- push 8
- push 0
- push esi
- lea eax, DWORD PTR [ebx+256]
- push ebp
- push eax
- mov eax, edi
- call _itrans_sp
- ; 426 : itrans_sp(blocks[9], mb_pred, currMB, pl, 4, 8);
- push 8
- push 4
- push esi
- push ebp
- lea ecx, DWORD PTR [ebx+288]
- push ecx
- mov eax, edi
- call _itrans_sp
- ; 427 : itrans_sp(blocks[10], mb_pred, currMB, pl, 0, 12);
- push 12 ; 0000000cH
- push 0
- push esi
- lea edx, DWORD PTR [ebx+320]
- push ebp
- push edx
- mov eax, edi
- call _itrans_sp
- ; 428 : itrans_sp(blocks[11], mb_pred, currMB, pl, 4, 12);
- push 12 ; 0000000cH
- push 4
- push esi
- lea eax, DWORD PTR [ebx+352]
- push ebp
- push eax
- mov eax, edi
- call _itrans_sp
- add esp, 80 ; 00000050H
- ; 429 : itrans_sp(blocks[12], mb_pred, currMB, pl, 8, 8);
- push 8
- push 8
- push esi
- lea ecx, DWORD PTR [ebx+384]
- push ebp
- push ecx
- mov eax, edi
- call _itrans_sp
- ; 430 : itrans_sp(blocks[13], mb_pred, currMB, pl, 12, 8);
- push 8
- push 12 ; 0000000cH
- push esi
- lea edx, DWORD PTR [ebx+416]
- push ebp
- push edx
- mov eax, edi
- call _itrans_sp
- ; 431 : itrans_sp(blocks[14], mb_pred, currMB, pl, 8, 12);
- push 12 ; 0000000cH
- push 8
- push esi
- lea eax, DWORD PTR [ebx+448]
- push ebp
- push eax
- mov eax, edi
- call _itrans_sp
- ; 432 : itrans_sp(blocks[15], mb_pred, currMB, pl, 12, 12);
- push 12 ; 0000000cH
- push 12 ; 0000000cH
- push esi
- add ebx, 480 ; 000001e0H
- push ebp
- push ebx
- mov eax, edi
- call _itrans_sp
- mov ebp, DWORD PTR _currSlice$[esp+100] ; reload currSlice (ebp held mb_pred base; +100 = local at [esp+16] plus the 80 bytes of pending args, +4)
- add esp, 80 ; 00000050H
- jmp COPY_16x16
- 
- $LN4@iMBtrans4x:
- ; 433 : }
- ; 434 : else if (currMB->is_lossless)
- cmp DWORD PTR [edi+84], 0 ; currMB->is_lossless (offset 84)
- je $LN2@iMBtrans4x
- push 0
- push 0
- ; 435 : {
- ; 436 : Inv_Residual_trans_4x4(currMB, pl, 0, 0);
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 437 : Inv_Residual_trans_4x4(currMB, pl, 4, 0);
- push 0
- push 4
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 438 : Inv_Residual_trans_4x4(currMB, pl, 0, 4);
- push 4
- push 0
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 439 : Inv_Residual_trans_4x4(currMB, pl, 4, 4);
- push 4
- push 4
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- add esp, 64 ; 00000040H
- ; 440 : Inv_Residual_trans_4x4(currMB, pl, 8, 0);
- push 0
- push 8
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 441 : Inv_Residual_trans_4x4(currMB, pl, 12, 0);
- push 0
- push 12 ; 0000000cH
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 442 : Inv_Residual_trans_4x4(currMB, pl, 8, 4);
- push 4
- push 8
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 443 : Inv_Residual_trans_4x4(currMB, pl, 12, 4);
- push 4
- push 12 ; 0000000cH
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- add esp, 64 ; 00000040H
- ; 444 : Inv_Residual_trans_4x4(currMB, pl, 0, 8);
- push 8
- push 0
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 445 : Inv_Residual_trans_4x4(currMB, pl, 4, 8);
- push 8
- push 4
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 446 : Inv_Residual_trans_4x4(currMB, pl, 0, 12);
- push 12 ; 0000000cH
- push 0
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 447 : Inv_Residual_trans_4x4(currMB, pl, 4, 12);
- push 12 ; 0000000cH
- push 4
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- add esp, 64 ; 00000040H
- ; 448 : Inv_Residual_trans_4x4(currMB, pl, 8, 8);
- push 8
- push 8
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 449 : Inv_Residual_trans_4x4(currMB, pl, 12, 8);
- push 8
- push 12 ; 0000000cH
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 450 : Inv_Residual_trans_4x4(currMB, pl, 8, 12);
- push 12 ; 0000000cH
- push 8
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- ; 451 : Inv_Residual_trans_4x4(currMB, pl, 12, 12);
- push 12 ; 0000000cH
- push 12 ; 0000000cH
- push esi
- push edi
- call _Inv_Residual_trans_4x4
- add esp, 64 ; 00000040H
- ; 452 : }
- ; 453 : else
- jmp COPY_16x16
- $LN2@iMBtrans4x:
- ; 454 : {
- ; 455 : const h264_short_block_t *blocks = currSlice->cof4[pl];
- ; 456 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
- mov edx, esi
- mov ecx, esi
- shl edx, 8
- shl ecx, 9
- lea eax, DWORD PTR [edx+ebp]
- lea ebx, DWORD PTR [ecx+ebp+cof@Slice]
- ; 457 : h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
- 
- ; put things in registers that itrans4x4_mmx_direct wants
- ; (eax = coefficient block, ecx = mb_pred, edx = mb_rec)
- lea edx, [eax + mb_rec@Slice]; mb_rec
- lea ecx, [eax + mb_pred@Slice] ; mb_pred
- mov eax, ebx ; blocks
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[0], mb_pred, mb_rec, 0, 0);
- 
- ; mb_pred/mb_rec rows are 16 bytes: +4 steps one 4x4 column right,
- ; +52 (= 64 - 12) undoes the three +4s and advances four rows.
- ; Coefficient blocks are 32 bytes each, indexed in coding order.
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+32]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[1], mb_pred, mb_rec, 4, 0);
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+128]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[4], mb_pred, mb_rec, 8, 0);
- 
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+160]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[5], mb_pred, mb_rec, 12, 0);
- ; second row
- lea edx, [edx+52]
- lea ecx, [ecx+52]
- lea eax, [ebx+64]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[2], mb_pred, mb_rec, 0, 4);
- 
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+96]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[3], mb_pred, mb_rec, 4, 4);
- 
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+192]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[6], mb_pred, mb_rec, 8, 4);
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+224]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[7], mb_pred, mb_rec, 12, 4);
- ; third row
- lea edx, [edx+52]
- lea ecx, [ecx+52]
- lea eax, [ebx+256]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[8], mb_pred, mb_rec, 0, 8);
- 
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+288]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[9], mb_pred, mb_rec, 4, 8);
- 
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+384]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[12], mb_pred, mb_rec, 8, 8);
- 
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+416]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[13], mb_pred, mb_rec, 12, 8);
- 
- ; fourth row
- lea edx, [edx+52]
- lea ecx, [ecx+52]
- lea eax, [ebx+320]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[10], mb_pred, mb_rec, 0, 12);
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+352]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[11], mb_pred, mb_rec, 4, 12);
- 
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+448]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[14], mb_pred, mb_rec, 8, 12);
- lea edx, [edx+4]
- lea ecx, [ecx+4]
- lea eax, [ebx+480]
- call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[15], mb_pred, mb_rec, 12, 12);
- COPY_16x16:
- ; construct picture from 4x4 blocks
- ; opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_rec[pl]);
- mov eax, DWORD PTR [edi+40]
- mov ecx, DWORD PTR [edi+36]
- shl esi, 8
- lea edx, DWORD PTR [esi+ebp+mb_rec@Slice]
- push edx
- mov edx, DWORD PTR _curr_img$[esp+20]
- push eax
- push ecx
- push edx
- call DWORD PTR _opt+copy_image_data_16x16_stride@OptimizedFunctions
- add esp, 16 ; 00000010H
- pop edi
- pop esi
- pop ebp
- pop ebx
- pop ecx
- ret 0
- _iMBtrans4x4 ENDP
- _TEXT ENDS
- _TEXT SEGMENT
- ALIGN 2
- PUBLIC _itrans8x8_sse2
- _itrans8x8_sse2 PROC NEAR
- ; parameter 1(mb_rec): 8 + ebp
- ; parameter 2(mb_pred): 12 + ebp
- ; parameter 3(block): 16 + ebp
- ; parameter 4(pos_x): 20 + ebp
- push ebp
- mov ebp, esp
- and esp, -16
- sub esp, 176
- mov edx, DWORD PTR [ebp+20]
- mov ecx, DWORD PTR [ebp+8] ; ecx: mb_rec
- add ecx, edx
- add edx, DWORD PTR [ebp+12] ; edx: mb_pred
- mov eax, DWORD PTR [ebp+16] ; eax: block
- ;;; __m128i a0, a1, a2, a3;
- ;;; __m128i p0, p1, p2, p3, p4, p5 ,p6, p7;
- ;;; __m128i b0, b1, b2, b3, b4, b5, b6, b7;
- ;;; __m128i r0, r1, r2, r3, r4, r5, r6, r7;
- ;;; __m128i const32, zero;
- ;;; __declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
- ;;; __m128i pred0, pred1;
- ;;;
- ;;; const32 = _mm_load_si128((const __m128i *)c32);
- movdqa xmm0, XMMWORD PTR const32
- ;;; zero = _mm_setzero_si128();
- ;;;
- ;;; // Horizontal
- ;;; b0 = _mm_load_si128((__m128i *)(block[0]));
- movdqa xmm4, XMMWORD PTR [eax]
- ;;; b1 = _mm_load_si128((__m128i *)(block[1]));
- movdqa xmm7, XMMWORD PTR [eax+16]
- ;;; b2 = _mm_load_si128((__m128i *)(block[2]));
- movdqa xmm5, XMMWORD PTR [eax+32]
- ;;; b3 = _mm_load_si128((__m128i *)(block[3]));
- movdqa xmm3, XMMWORD PTR [eax+48]
- ;;; b4 = _mm_load_si128((__m128i *)(block[4]));
- movdqa xmm6, XMMWORD PTR [eax+64]
- ;;; b5 = _mm_load_si128((__m128i *)(block[5]));
- ;;; b6 = _mm_load_si128((__m128i *)(block[6]));
- movdqa xmm1, XMMWORD PTR [eax+96]
- ;;; b7 = _mm_load_si128((__m128i *)(block[7]));
- movdqa xmm2, XMMWORD PTR [eax+112]
- movdqa XMMWORD PTR [esp], xmm0
- movdqa xmm0, XMMWORD PTR [eax+80]
- movdqa XMMWORD PTR [esp+16], xmm2
- ;;;
- ;;; /* rotate 8x8 (ugh) */
- ;;; r0 = _mm_unpacklo_epi16(b0, b2);
- movdqa xmm2, xmm4
- punpcklwd xmm2, xmm5
- ;;; r1 = _mm_unpacklo_epi16(b1, b3);
- ;;; r2 = _mm_unpackhi_epi16(b0, b2);
- punpckhwd xmm4, xmm5
- ;;; r3 = _mm_unpackhi_epi16(b1, b3);
- ;;; r4 = _mm_unpacklo_epi16(b4, b6);
- ;;; r5 = _mm_unpacklo_epi16(b5, b7);
- movdqa xmm5, xmm0
- movdqa XMMWORD PTR [esp+32], xmm2
- movdqa xmm2, xmm7
- punpcklwd xmm2, xmm3
- punpckhwd xmm7, xmm3
- movdqa xmm3, xmm6
- punpcklwd xmm3, xmm1
- movdqa XMMWORD PTR [esp+48], xmm3
- movdqa xmm3, XMMWORD PTR [esp+16]
- punpcklwd xmm5, xmm3
- ;;; r6 = _mm_unpackhi_epi16(b4, b6);
- punpckhwd xmm6, xmm1
- ;;; r7 = _mm_unpackhi_epi16(b5, b7);
- punpckhwd xmm0, xmm3
- ;;;
- ;;; b0 = _mm_unpacklo_epi16(r0, r1);
- movdqa xmm3, XMMWORD PTR [esp+32]
- movdqa xmm1, xmm3
- punpcklwd xmm1, xmm2
- ;;; b1 = _mm_unpackhi_epi16(r0, r1);
- punpckhwd xmm3, xmm2
- ;;; b2 = _mm_unpacklo_epi16(r2, r3);
- movdqa xmm2, xmm4
- punpcklwd xmm2, xmm7
- ;;; b3 = _mm_unpackhi_epi16(r2, r3);
- punpckhwd xmm4, xmm7
- movdqa XMMWORD PTR [esp+64], xmm4
- ;;; b4 = _mm_unpacklo_epi16(r4, r5);
- movdqa xmm4, XMMWORD PTR [esp+48]
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm5
- ;;; b5 = _mm_unpackhi_epi16(r4, r5);
- punpckhwd xmm4, xmm5
- ;;; b6 = _mm_unpacklo_epi16(r6, r7);
- movdqa xmm5, xmm6
- punpcklwd xmm5, xmm0
- ;;; b7 = _mm_unpackhi_epi16(r6, r7);
- punpckhwd xmm6, xmm0
- ;;;
- ;;; p0 = _mm_unpacklo_epi64(b0, b4);
- movdqa xmm0, xmm1
- punpcklqdq xmm0, xmm7
- ;;; p1 = _mm_unpackhi_epi64(b0, b4);
- punpckhqdq xmm1, xmm7
- movdqa XMMWORD PTR [esp+16], xmm1
- ;;; p2 = _mm_unpacklo_epi64(b1, b5);
- movdqa xmm1, xmm3
- punpcklqdq xmm1, xmm4
- ;;; p3 = _mm_unpackhi_epi64(b1, b5);
- ;;; p4 = _mm_unpacklo_epi64(b2, b6);
- ;;; p5 = _mm_unpackhi_epi64(b2, b6);
- ;;; p6 = _mm_unpacklo_epi64(b3, b7);
- ;;; p7 = _mm_unpackhi_epi64(b3, b7);
- ;;;
- ;;; /* perform approx DCT */
- ;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
- ;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
- ;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
- movdqa xmm7, xmm1
- psraw xmm7, 1
- punpckhqdq xmm3, xmm4
- movdqa XMMWORD PTR [esp+32], xmm3
- movdqa xmm3, xmm2
- punpcklqdq xmm3, xmm5
- punpckhqdq xmm2, xmm5
- movdqa xmm5, XMMWORD PTR [esp+64]
- movdqa xmm4, xmm5
- punpcklqdq xmm4, xmm6
- punpckhqdq xmm5, xmm6
- movdqa xmm6, xmm0
- paddw xmm6, xmm3
- psubw xmm0, xmm3
- ;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
- movdqa xmm3, xmm4
- ;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
- psraw xmm4, 1
- psubw xmm3, xmm7
- ;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
- paddw xmm1, xmm4
- ;;;
- ;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
- movdqa xmm4, xmm6
- ;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
- movdqa xmm7, xmm0
- paddw xmm4, xmm1
- psubw xmm7, xmm3
- movdqa XMMWORD PTR [esp+48], xmm7
- ;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
- paddw xmm0, xmm3
- movdqa XMMWORD PTR [esp+80], xmm0
- ;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
- ;;;
- ;;; //-p3 + p5 - p7 - (p7 >> 1);
- ;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
- ;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
- movdqa xmm0, XMMWORD PTR [esp+32]
- psubw xmm6, xmm1
- movdqa xmm1, xmm5
- psraw xmm1, 1
- movdqa xmm3, xmm2
- ;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
- ;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
- ;;;
- ;;; //p1 + p7 - p3 - (p3 >> 1);
- ;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
- movdqa xmm7, xmm0
- movdqa XMMWORD PTR [esp+96], xmm6
- ;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
- movdqa xmm6, XMMWORD PTR [esp+16]
- psubw xmm3, xmm0
- psubw xmm3, xmm5
- psraw xmm7, 1
- psubw xmm3, xmm1
- movdqa xmm1, xmm6
- paddw xmm1, xmm5
- ;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
- psubw xmm1, xmm0
- ;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
- psubw xmm1, xmm7
- ;;;
- ;;; // -p1 + p7 + p5 + (p5 >> 1);
- ;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
- movdqa xmm7, xmm2
- psraw xmm7, 1
- ;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
- psubw xmm5, xmm6
- ;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
- paddw xmm5, xmm2
- ;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
- paddw xmm5, xmm7
- ;;;
- ;;; // p3 + p5 + p1 + (p1 >> 1);
- ;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
- paddw xmm0, xmm2
- ;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
- ;;; p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
- ;;; a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
- ;;;
- ;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
- ;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
- ;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
- ;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
- ;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
- ;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
- ;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
- ;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
- ;;;
- ;;; p0 = _mm_add_epi16(b0, b7); // b0 + b7;
- ;;; p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
- ;;; p2 = _mm_add_epi16(b4, b3); // b4 + b3;
- ;;; p3 = _mm_add_epi16(b6, b1); // b6 + b1;
- movdqa xmm2, XMMWORD PTR [esp+96]
- paddw xmm0, xmm6
- psraw xmm6, 1
- paddw xmm0, xmm6
- movdqa xmm7, xmm0
- movdqa xmm6, xmm5
- psraw xmm7, 2
- paddw xmm7, xmm3
- psraw xmm6, 2
- paddw xmm6, xmm1
- psraw xmm1, 2
- psubw xmm5, xmm1
- movdqa xmm1, xmm4
- psraw xmm3, 2
- psubw xmm0, xmm3
- movdqa xmm3, XMMWORD PTR [esp+80]
- movdqa XMMWORD PTR [esp+32], xmm0
- ;;; p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
- ;;; p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
- ;;; p6 = _mm_add_epi16(b2, b5); // b2 + b5;
- ;;; p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
- psubw xmm4, XMMWORD PTR [esp+32]
- paddw xmm1, xmm0
- movdqa XMMWORD PTR [esp+112], xmm1
- movdqa xmm1, XMMWORD PTR [esp+48]
- movdqa xmm0, xmm1
- psubw xmm0, xmm5
- movdqa XMMWORD PTR [esp+16], xmm0
- movdqa xmm0, xmm3
- paddw xmm0, xmm6
- psubw xmm3, xmm6
- movdqa XMMWORD PTR [esp+128], xmm0
- ;;;
- ;;; /* rotate 8x8 (ugh) */
- ;;; r0 = _mm_unpacklo_epi16(p0, p2);
- movdqa xmm6, XMMWORD PTR [esp+128]
- movdqa xmm0, xmm2
- paddw xmm0, xmm7
- psubw xmm2, xmm7
- paddw xmm1, xmm5
- movdqa xmm5, XMMWORD PTR [esp+112]
- movdqa XMMWORD PTR [esp+144], xmm4
- movdqa xmm4, xmm5
- punpcklwd xmm4, xmm6
- ;;; r1 = _mm_unpacklo_epi16(p1, p3);
- ;;; r2 = _mm_unpackhi_epi16(p0, p2);
- punpckhwd xmm5, xmm6
- ;;; r3 = _mm_unpackhi_epi16(p1, p3);
- ;;; r4 = _mm_unpacklo_epi16(p4, p6);
- ;;; r5 = _mm_unpacklo_epi16(p5, p7);
- movdqa xmm6, xmm3
- movdqa XMMWORD PTR [esp+64], xmm4
- movdqa xmm4, XMMWORD PTR [esp+16]
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm0
- punpckhwd xmm4, xmm0
- movdqa xmm0, xmm2
- punpcklwd xmm0, xmm1
- movdqa XMMWORD PTR [esp+128], xmm0
- movdqa xmm0, XMMWORD PTR [esp+144]
- punpcklwd xmm6, xmm0
- ;;; r6 = _mm_unpackhi_epi16(p4, p6);
- punpckhwd xmm2, xmm1
- ;;; r7 = _mm_unpackhi_epi16(p5, p7);
- ;;;
- ;;; b0 = _mm_unpacklo_epi16(r0, r1);
- movdqa xmm1, XMMWORD PTR [esp+64]
- punpckhwd xmm3, xmm0
- movdqa xmm0, xmm1
- punpcklwd xmm0, xmm7
- ;;; b1 = _mm_unpackhi_epi16(r0, r1);
- punpckhwd xmm1, xmm7
- ;;; b2 = _mm_unpacklo_epi16(r2, r3);
- movdqa xmm7, xmm5
- punpcklwd xmm7, xmm4
- ;;; b3 = _mm_unpackhi_epi16(r2, r3);
- punpckhwd xmm5, xmm4
- movdqa XMMWORD PTR [esp+112], xmm5
- ;;; b4 = _mm_unpacklo_epi16(r4, r5);
- movdqa xmm5, XMMWORD PTR [esp+128]
- movdqa xmm4, xmm5
- punpcklwd xmm4, xmm6
- ;;; b5 = _mm_unpackhi_epi16(r4, r5);
- punpckhwd xmm5, xmm6
- ;;; b6 = _mm_unpacklo_epi16(r6, r7);
- movdqa xmm6, xmm2
- punpcklwd xmm6, xmm3
- ;;; b7 = _mm_unpackhi_epi16(r6, r7);
- punpckhwd xmm2, xmm3
- ;;;
- ;;; p0 = _mm_unpacklo_epi64(b0, b4);
- movdqa xmm3, xmm0
- punpcklqdq xmm3, xmm4
- ;;; p1 = _mm_unpackhi_epi64(b0, b4);
- punpckhqdq xmm0, xmm4
- movdqa XMMWORD PTR [esp+144], xmm0
- ;;; p2 = _mm_unpacklo_epi64(b1, b5);
- ;;; p3 = _mm_unpackhi_epi64(b1, b5);
- ;;; p4 = _mm_unpacklo_epi64(b2, b6);
- ;;; p5 = _mm_unpackhi_epi64(b2, b6);
- ;;; p6 = _mm_unpacklo_epi64(b3, b7);
- movdqa xmm0, XMMWORD PTR [esp+112]
- movdqa xmm4, xmm1
- punpcklqdq xmm4, xmm5
- punpckhqdq xmm1, xmm5
- movdqa XMMWORD PTR [esp+64], xmm1
- movdqa xmm1, xmm7
- movdqa xmm5, xmm0
- punpcklqdq xmm1, xmm6
- punpckhqdq xmm7, xmm6
- ;;; p7 = _mm_unpackhi_epi64(b3, b7);
- ;;;
- ;;;
- ;;; /* Vertical */
- ;;;
- ;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
- ;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
- ;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
- movdqa xmm6, xmm4
- psraw xmm6, 1
- punpcklqdq xmm5, xmm2
- punpckhqdq xmm0, xmm2
- movdqa xmm2, xmm3
- paddw xmm2, xmm1
- psubw xmm3, xmm1
- ;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
- movdqa xmm1, xmm5
- ;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
- psraw xmm5, 1
- psubw xmm1, xmm6
- ;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
- paddw xmm4, xmm5
- ;;;
- ;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
- movdqa xmm5, xmm2
- ;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
- movdqa xmm6, xmm3
- paddw xmm5, xmm4
- psubw xmm6, xmm1
- movdqa XMMWORD PTR [esp+128], xmm6
- ;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
- ;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
- ;;;
- ;;; //-p3 + p5 - p7 - (p7 >> 1);
- ;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
- ;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
- movdqa xmm6, XMMWORD PTR [esp+64]
- paddw xmm3, xmm1
- movdqa XMMWORD PTR [esp+80], xmm3
- psubw xmm2, xmm4
- movdqa xmm1, xmm0
- psraw xmm1, 1
- movdqa xmm3, xmm7
- movdqa XMMWORD PTR [esp+96], xmm2
- psubw xmm3, xmm6
- ;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
- psubw xmm3, xmm0
- ;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
- ;;;
- ;;; //p1 + p7 - p3 - (p3 >> 1);
- ;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
- movdqa xmm2, xmm6
- psraw xmm2, 1
- psubw xmm3, xmm1
- ;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
- movdqa xmm1, XMMWORD PTR [esp+144]
- movdqa xmm4, xmm1
- paddw xmm4, xmm0
- ;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
- psubw xmm4, xmm6
- ;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
- psubw xmm4, xmm2
- ;;;
- ;;; // -p1 + p7 + p5 + (p5 >> 1);
- ;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
- movdqa xmm2, xmm7
- psraw xmm2, 1
- ;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
- psubw xmm0, xmm1
- ;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
- paddw xmm0, xmm7
- ;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
- paddw xmm0, xmm2
- ;;;
- ;;; // p3 + p5 + p1 + (p1 >> 1);
- ;;; r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
- movdqa xmm2, xmm1
- psraw xmm2, 1
- ;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
- paddw xmm6, xmm7
- ;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
- ;;; a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
- ;;;
- ;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
- ;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
- ;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
- ;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
- ;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
- ;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
- ;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
- ;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
- ;;;
- ;;; r0 = _mm_add_epi16(b0, b7); // b0 + b7;
- ;;; r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
- movdqa xmm7, XMMWORD PTR [esp+128]
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- movdqa xmm1, xmm6
- psraw xmm1, 2
- movdqa xmm2, xmm0
- paddw xmm1, xmm3
- psraw xmm2, 2
- paddw xmm2, xmm4
- psraw xmm4, 2
- psubw xmm0, xmm4
- psraw xmm3, 2
- psubw xmm6, xmm3
- movdqa XMMWORD PTR [esp+64], xmm6
- movdqa xmm3, xmm5
- ;;; r2 = _mm_add_epi16(b4, b3); // b4 + b3;
- ;;; r3 = _mm_add_epi16(b6, b1); // b6 + b1;
- ;;; r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
- ;;; r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
- ;;; r6 = _mm_add_epi16(b2, b5); // b2 + b5;
- ;;; r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
- psubw xmm5, XMMWORD PTR [esp+64]
- paddw xmm3, xmm6
- movdqa XMMWORD PTR [esp+144], xmm3
- movdqa xmm3, xmm7
- psubw xmm3, xmm0
- movdqa XMMWORD PTR [esp+48], xmm3
- movdqa xmm3, XMMWORD PTR [esp+80]
- movdqa xmm4, xmm3
- paddw xmm4, xmm2
- psubw xmm3, xmm2
- ;;;
- ;;;
- ;;; // add in prediction values
- ;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
- ;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
- ;;; // (x + 32) >> 6
- ;;; r0 = _mm_adds_epi16(r0, const32);
- movdqa xmm2, XMMWORD PTR const32
- movdqa XMMWORD PTR [esp+16], xmm4
- movdqa xmm4, XMMWORD PTR [esp+96]
- movdqa xmm6, xmm4
- paddw xmm6, xmm1
- psubw xmm4, xmm1
- ;;; r0 = _mm_srai_epi16(r0, 6);
- ;;; r1 = _mm_adds_epi16(r1, const32);
- movdqa xmm1, XMMWORD PTR [esp+48]
- paddw xmm7, xmm0
- movdqa xmm0, XMMWORD PTR [esp+144]
- movdqa XMMWORD PTR [esp+128], xmm7
- ;;; r1 = _mm_srai_epi16(r1, 6);
- ;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
- ;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
- movq xmm7, QWORD PTR [edx+16]
- movdqa XMMWORD PTR [esp+32], xmm5
- paddsw xmm0, xmm2
- psraw xmm0, 6
- paddsw xmm1, xmm2
- pxor xmm2, xmm2
- punpcklbw xmm7, xmm2
- movq xmm5, QWORD PTR [edx]
- punpcklbw xmm5, xmm2
- psraw xmm1, 6
- ;;; pred0 = _mm_adds_epi16(pred0, r0);
- ;;; pred1 = _mm_adds_epi16(pred1, r1);
- paddsw xmm7, xmm1
- paddsw xmm5, xmm0
- ;;;
- ;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
- packuswb xmm5, xmm7
- ;;;
- ;;; // store
- ;;; _mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
- movdqa xmm0, XMMWORD PTR [esp+32]
- movdqa xmm2, XMMWORD PTR [esp+128]
- movq QWORD PTR [ecx], xmm5
- ;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
- ;;; pred0 = _mm_srli_si128(pred0, 8);
- psrldq xmm5, 8
- ;;; _mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
- movq QWORD PTR [ecx+16], xmm5
- ;;;
- ;;; /* --- */
- ;;;
- ;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
- movq xmm1, QWORD PTR [edx+32]
- ;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
- ;;; // (x + 32) >> 6
- ;;; r2 = _mm_adds_epi16(r2, const32);
- movdqa xmm5, XMMWORD PTR [esp]
- movdqa XMMWORD PTR [esp+32], xmm0 ;
- ;;; r2 = _mm_srai_epi16(r2, 6);
- ;;; r3 = _mm_adds_epi16(r3, const32);
- paddsw xmm6, xmm5
- ;;; r3 = _mm_srai_epi16(r3, 6);
- psraw xmm6, 6
- ;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
- pxor xmm7, xmm7
- punpcklbw xmm1, xmm7
- movdqa xmm0, XMMWORD PTR [esp+16]
- paddsw xmm0, xmm5
- psraw xmm0, 6
- ;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
- ;;; pred0 = _mm_adds_epi16(pred0, r2);
- paddsw xmm1, xmm0
- ;;; pred1 = _mm_adds_epi16(pred1, r3);
- ;;;
- ;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
- ;;;
- ;;; // store
- ;;; _mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
- movdqa xmm0, XMMWORD PTR [esp+32]
- movq xmm5, QWORD PTR [edx+48]
- punpcklbw xmm5, xmm7
- paddsw xmm5, xmm6
- packuswb xmm1, xmm5
- movq QWORD PTR [ecx+32], xmm1
- ;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
- ;;; pred0 = _mm_srli_si128(pred0, 8);
- psrldq xmm1, 8
- ;;; _mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
- movq QWORD PTR [ecx+48], xmm1
- ;;;
- ;;; /* --- */
- ;;;
- ;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
- movq xmm7, QWORD PTR [edx+64]
- ;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
- movq xmm6, QWORD PTR [edx+80]
- ;;; // (x + 32) >> 6
- ;;; r4 = _mm_adds_epi16(r4, const32);
- ;;; r4 = _mm_srai_epi16(r4, 6);
- ;;; r5 = _mm_adds_epi16(r5, const32);
- ;;; r5 = _mm_srai_epi16(r5, 6);
- ;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
- pxor xmm5, xmm5
- punpcklbw xmm7, xmm5
- ;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
- punpcklbw xmm6, xmm5
- movdqa xmm1, XMMWORD PTR [esp]
- paddsw xmm4, xmm1
- psraw xmm4, 6
- paddsw xmm3, xmm1
- psraw xmm3, 6
- ;;; pred0 = _mm_adds_epi16(pred0, r4);
- paddsw xmm7, xmm4
- ;;; pred1 = _mm_adds_epi16(pred1, r5);
- paddsw xmm6, xmm3
- ;;;
- ;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
- packuswb xmm7, xmm6
- ;;;
- ;;; // store
- ;;; _mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
- movq QWORD PTR [ecx+64], xmm7
- ;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
- ;;; pred0 = _mm_srli_si128(pred0, 8);
- psrldq xmm7, 8
- ;;; _mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
- movq QWORD PTR [ecx+80], xmm7
- ;;;
- ;;; /* --- */
- ;;;
- ;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
- movq xmm5, QWORD PTR [edx+96]
- ;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
- movq xmm4, QWORD PTR [edx+112]
- ;;; // (x + 32) >> 6
- ;;; r6 = _mm_adds_epi16(r6, const32);
- ;;; r6 = _mm_srai_epi16(r6, 6);
- ;;; r7 = _mm_adds_epi16(r7, const32);
- ;;; r7 = _mm_srai_epi16(r7, 6);
- ;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
- pxor xmm3, xmm3
- punpcklbw xmm5, xmm3
- ;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
- punpcklbw xmm4, xmm3
- movdqa xmm1, XMMWORD PTR [esp]
- paddsw xmm2, xmm1
- psraw xmm2, 6
- paddsw xmm0, xmm1
- psraw xmm0, 6
- ;;; pred0 = _mm_adds_epi16(pred0, r6);
- paddsw xmm5, xmm2
- ;;; pred1 = _mm_adds_epi16(pred1, r7);
- paddsw xmm4, xmm0
- ;;;
- ;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
- packuswb xmm5, xmm4
- ;;;
- ;;; // store
- ;;; _mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
- movq QWORD PTR [ecx+96], xmm5
- ;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
- ;;; pred0 = _mm_srli_si128(pred0, 8);
- psrldq xmm5, 8
- ;;; _mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
- movq QWORD PTR [ecx+112], xmm5
- mov esp, ebp
- pop ebp
- ret
- ALIGN 2
- _itrans8x8_sse2 ENDP
- END
|