- /****************************************************************************
- *
- * Module Title : DeblockwmtOpt.c
- *
- * Description : Optimized functions for deblocking
- *
- * AUTHOR : Yaowu Xu
- *
- *****************************************************************************
- * Revision History
- *
- * 1.02 YWX 08-Dec-00 Configuration baseline from deblockopt.c
- *
- *****************************************************************************
- */
-
- /****************************************************************************
- * Header Files
- *****************************************************************************
- */
- #include "postp.h"
- #include "stdlib.h"
- #include <math.h>
- /****************************************************************************
- * Module constants.
- *****************************************************************************
- */
- #if defined(_WIN32_WCE)
- #else
- __declspec(align(16)) static short Eight128s[] = {128, 128, 128, 128,128, 128, 128, 128 };
- __declspec(align(16)) static short Eight64s[] = {64, 64, 64, 64, 64, 64, 64, 64 };
- __declspec(align(16)) static short EightThrees[]= {3, 3, 3, 3, 3, 3, 3, 3};
- __declspec(align(16)) static short EightFours[]= {4, 4, 4, 4, 4, 4, 4, 4};
- __declspec(align(16)) static short Four128s[] = {128, 128, 128, 128};
- __declspec(align(16)) static short Four64s[] = {64, 64, 64, 64 };
- __declspec(align(16)) static short FourThrees[]= {3, 3, 3, 3};
- __declspec(align(16)) static short FourFours[]= {4, 4, 4, 4};
- __declspec(align(16)) static short EightOnes[]= { 1, 1, 1, 1, 1, 1, 1, 1};
- #endif
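- /* Packed word constants: the Eight* arrays are eight-lane (SSE2) operands
-    used as multipliers and rounding terms by the filters below; the Four*
-    arrays are their four-lane (MMX) counterparts. */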
- /****************************************************************************
- * Explicit Imports
- *****************************************************************************
- */
- extern double gaussian(double sigma, double mu, double x);
- extern UINT32 *DeblockLimitValuesV2;
- /****************************************************************************
- * Exported Global Variables
- *****************************************************************************
- */
- /****************************************************************************
- * Exported Functions
- *****************************************************************************
- */
- /****************************************************************************
- * Module Statics
- *****************************************************************************
- */
- /****************************************************************************
- *
- * ROUTINE : DeblockLoopFilteredBand_WMT
- *
- * INPUTS : pbi, SrcPtr, DesPtr, PlaneLineStep, FragAcross, StartFrag,
- *          QuantScale
- *
- * OUTPUTS : Filtered pixels written through DesPtr; fragment variances
- *           accumulated in pbi->FragmentVariances
- *
- * RETURNS : None
- *
- * FUNCTION : Filter both horizontal and vertical edges in a band
- *
- * SPECIAL NOTES :
- *
- * REFERENCE :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void DeblockLoopFilteredBand_WMT(
- POSTPROC_INSTANCE *pbi,
- UINT8 *SrcPtr,
- UINT8 *DesPtr,
- UINT32 PlaneLineStep,
- UINT32 FragAcross,
- UINT32 StartFrag,
- UINT32 *QuantScale
- )
- {
- UINT32 j;
- UINT32 CurrentFrag=StartFrag;
- UINT32 QStep;
- UINT8 *Src, *Des;
- UINT32 Var1, Var2;
- #if defined(_WIN32_WCE)
- return;
- #else
- __declspec(align(16)) short QStepWMT[8];
- __declspec(align(16)) short FLimitWMT[8];
- __declspec(align(16)) short Rows[80];
- __declspec(align(16)) unsigned short Variance1[8];
- __declspec(align(16)) unsigned short Variance2[8];
- Src=SrcPtr;
- Des=DesPtr;
- while(CurrentFrag < StartFrag + FragAcross )
- {
-
- QStep = QuantScale[ pbi->FragQIndex[CurrentFrag+FragAcross]];
- if( QStep > 3 )
- {
- QStepWMT[0] = (INT16)QStep;
- QStepWMT[1] = (INT16)QStep;
- QStepWMT[2] = (INT16)QStep;
- QStepWMT[3] = (INT16)QStep;
- QStepWMT[4] = (INT16)QStep;
- QStepWMT[5] = (INT16)QStep;
- QStepWMT[6] = (INT16)QStep;
- QStepWMT[7] = (INT16)QStep;
- __asm
- {
-
- /* Save the registers */
- push eax
- push ecx
- push edx
- push esi
- push edi
-
-
- /* Calculate the FLimit and store FLimit and QStep */
-
- movdqa xmm0, QStepWMT /* xmm0 = QStep */
- movdqa xmm1, EightThrees /* xmm1 = 3 3 3 3 3 3 3 3 */
- pmullw xmm1, xmm0 /* xmm1 = QStep * 3 */
- pmullw xmm1, xmm0 /* xmm1 = QStep * QStep * 3 */
-
- psrlw xmm1, 5 /* xmm1 = FLimit */
- movdqa [FLimitWMT], xmm1 /* Save FLimit */
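-
- /* Per-lane sketch of the two constants just stored: each of the eight
-    word lanes holds QStepWMT[i] = QStep and
-    FLimitWMT[i] = ( 3 * QStep * QStep ) >> 5 */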
-
- /* setup the pointers */
- mov eax, Src /* eax = Src */
- xor edx, edx /* clear edx */
- mov esi, Des /* esi = Des */
- lea edi, Rows /* edi = Rows */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- pxor xmm7, xmm7 /* Clear xmm7 */
-
- sub edx, ecx /* edx = -Pitch */
-
- lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
- lea esi, [esi + edx * 2 ] /* esi = Des - 2 * Pitch */
- /* Copy the data to the intermediate buffer */
-
- movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[-5*Pitch] */
- movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[-4*Pitch] */
-
- punpcklbw xmm0, xmm7 /* expand to words */
- punpcklbw xmm1, xmm7 /* expand to words */
- movdqa [edi], xmm0 /* write 8 words */
- movdqa [edi+16], xmm1 /* write 8 words */
- movq xmm2, QWORD PTR [eax+ecx] /* xmm2 = Src[-3*Pitch] */
- movq xmm3, QWORD PTR [eax+ecx*2]/* xmm3 = Src[-2*Pitch] */
- punpcklbw xmm2, xmm7 /* expand to words */
- punpcklbw xmm3, xmm7 /* expand to words */
-
- movdqa [edi+32], xmm2 /* write 8 words */
- movdqa [edi+48], xmm3 /* write 8 words */
- lea eax, [eax+ecx*4] /* eax= Src */
- movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[-Pitch] */
- movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[0] */
-
- punpcklbw xmm0, xmm7 /* expand to words */
- punpcklbw xmm1, xmm7 /* expand to words */
- movdqa [edi+64], xmm0 /* write 8 words */
- movdqa [edi+80], xmm1 /* write 8 words */
- movq xmm2, QWORD PTR [eax+ecx] /* xmm2 = Src[Pitch] */
- movq xmm3, QWORD PTR [eax+ecx*2]/* xmm3 = Src[2*Pitch] */
- punpcklbw xmm2, xmm7 /* expand to words */
- punpcklbw xmm3, xmm7 /* expand to words */
-
- movdqa [edi+96], xmm2 /* write 8 words */
- movdqa [edi+112], xmm3 /* write 8 words */
- lea eax, [eax+ecx*4] /* eax= Src+4*Pitch */
- movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[3*Pitch] */
- movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[4*Pitch] */
-
- punpcklbw xmm0, xmm7 /* expand to words */
- punpcklbw xmm1, xmm7 /* expand to words */
- movdqa [edi+128], xmm0 /* write 8 words */
- movdqa [edi+144], xmm1 /* write 8 words */
-
- /* done with copying everything to intermediate buffer */
- /* Now, compute the variances for Pixel 1-4 and 5-8 */
-
- /* we use xmm0,xmm1,xmm2 for 1234 and xmm4, xmm5, xmm6 for 5-8 */
- /* xmm7 = 0, xmm3 = {128, 128, 128, 128, 128, 128, 128, 128} */
-
- pcmpeqw xmm3, xmm3 /* xmm3 = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF */
- psllw xmm3, 15 /* xmm3 = 80008000800080008000800080008000 */
- psrlw xmm3, 8 /* xmm3 = 00800080008000800080008000800080 */
-
- movdqa xmm2, [edi+16] /* Pixel 1 */
- movdqa xmm6, [edi+80] /* Pixel 5 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- movdqa xmm0, xmm2 /* xmm0 = pixel 1 */
- movdqa xmm4, xmm6 /* xmm4 = pixel 5 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel1 * pixel1 */
- pmullw xmm6, xmm6 /* xmm6 = pixel5 * pixel5 */
-
- movdqa xmm1, xmm2 /* xmm1 = pixel1^2 */
- movdqa xmm5, xmm6 /* xmm5 = pixel5^2 */
-
- movdqa xmm2, [edi+32] /* Pixel 2 */
- movdqa xmm6, [edi+96] /* Pixel 6 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 2 */
- paddw xmm4, xmm6 /* xmm4 += pixel 6 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel2^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel6^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel2^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel6^2 */
-
- movdqa xmm2, [edi+48] /* Pixel 3 */
- movdqa xmm6, [edi+112] /* Pixel 7 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 3 */
- paddw xmm4, xmm6 /* xmm4 += pixel 7 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel3^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel7^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel3^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel7^2 */
-
- movdqa xmm2, [edi+64] /* Pixel 4 */
- movdqa xmm6, [edi+128] /* Pixel 8 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 4 */
- paddw xmm4, xmm6 /* xmm4 += pixel 8 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel4^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel8^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel4^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel8^2 */
-
- /* xmm0 = x1 + x2 + x3 + x4 */
- /* xmm1 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* xmm4 = x5 + x6 + x7 + x8 */
- /* xmm5 = x5^2 + x6^2 + x7^2 + x8^2 */
-
- movdqa xmm7, xmm3 /* xmm7 = xmm3 */
- psrlw xmm7, 7 /* xmm7 = 00010001000100010001000100010001 */
-
- movdqa xmm2, xmm0 /* make copy of sum1 */
- movdqa xmm6, xmm4 /* make copy of sum2 */
-
- paddw xmm0, xmm7 /* (sum1 + 1) */
- paddw xmm4, xmm7 /* (sum2 + 1) */
-
- psraw xmm2, 1 /* sum1 /2 */
- psraw xmm6, 1 /* sum2 /2 */
-
- psraw xmm0, 1 /* (sum1 + 1)/2 */
- psraw xmm4, 1 /* (sum2 + 1)/2 */
-
- pmullw xmm2, xmm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw xmm6, xmm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw xmm1, xmm2 /* Variance 1 */
- psubw xmm5, xmm6 /* Variance 2 */
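-
- /* Per-lane sketch of what xmm1/xmm5 now hold: with
-    S  = (x1-128) + (x2-128) + (x3-128) + (x4-128) and
-    SS = (x1-128)^2 + ... + (x4-128)^2,
-    Variance = SS - (S>>1)*((S+1)>>1), roughly SS - S*S/4, i.e. about four
-    times the variance of the four pixels on one side of the edge, per column. */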
-
- movdqa xmm7, FLimitWMT /* xmm7 = FLimit */
- movdqa xmm2, xmm1 /* copy of Variance 1 */
- movdqa [Variance1], xmm1 /* save Variance1 */
- movdqa [Variance2], xmm5 /* save Variance2 */
- movdqa xmm6, xmm5 /* Variance 2 */
- psubw xmm1, xmm7 /* Variance 1 < Flimit? */
-
- psubw xmm5, xmm7 /* Variance 2 < Flimit? */
- psraw xmm2, 15 /* Variance 1 >= 32768? */
- psraw xmm6, 15 /* Variance 2 >= 32768? */
- psraw xmm1, 15 /* FFFF/0000 for true/false */
-
- psraw xmm5, 15 /* FFFF/0000 for true/false */
- movdqa xmm7, [edi+64] /* xmm7 = Pixel 4 */
- pandn xmm2, xmm1 /* Variance1 < 32768 &&
- Variance1 < FLimit */
- pandn xmm6, xmm5 /* Variance2 < 32768 &&
- Variance2 < FLimit */
-
- movdqa xmm4, [edi+80] /* xmm4 = Pixel 5 */
- pand xmm6, xmm2 /* xmm6 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movdqa xmm2, xmm7 /* make copy of Pixel4 */
- psubusw xmm7, xmm4 /* 4 - 5 */
- psubusw xmm4, xmm2 /* 5 - 4 */
-
- por xmm7, xmm4 /* abs(4 - 5) */
- psubw xmm7, QStepWMT /* abs(4-5) < QStep ? */
-
- psraw xmm7, 15 /* FFFF/0000 for True/False */
- pand xmm7, xmm6
-
- /* xmm7 = Variance1 < FLimit && Variance2 < FLimit && abs(4-5) < QStep */
- /* xmm7 is now in use */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movdqa xmm5, [edi] /* xmm5 = -5 */
- movdqa xmm4, [edi + 16] /* xmm4 = -4 */
-
- movdqa xmm3, xmm4 /* copy of -4 */
- movdqa xmm6, xmm5 /* copy of -5 */
-
- psubusw xmm4, xmm6 /* xmm4 = [-4] - [-5] */
- psubusw xmm5, xmm3 /* xmm5 = [-5] - [-4] */
-
- por xmm4, xmm5 /* abs([-4]-[-5] ) */
- psubw xmm4, QStepWMT /* abs([-4]-[-5] )<QStep? */
-
- psraw xmm4, 15 /* FFFF/0000 for True/False */
- movdqa xmm1, xmm4 /* copy of the xmm4 */
-
- pand xmm4, xmm6 /* */
- pandn xmm1, xmm3 /* */
-
- por xmm1, xmm4 /* xmm1 = p1 */
-
- /* now find P2 */
-
- movdqa xmm4, [edi+128] /* xmm4 = [3] */
- movdqa xmm5, [edi+144] /* xmm5 = [4] */
-
- movdqa xmm3, xmm4 /* copy of 3 */
- movdqa xmm6, xmm5 /* copy of 4 */
-
- psubusw xmm4, xmm6 /* xmm4 = [3] - [4] */
- psubusw xmm5, xmm3 /* xmm5 = [4] - [3] */
-
- por xmm4, xmm5 /* abs([3]-[4] ) */
- psubw xmm4, QStepWMT /* abs([3]-[4] )<QStep? */
-
- psraw xmm4, 15 /* FFFF/0000 for True/False */
- movdqa xmm2, xmm4 /* copy of the xmm4 */
-
- pand xmm4, xmm6 /* */
- pandn xmm2, xmm3 /* */
-
- por xmm2, xmm4 /* xmm2 = p2 */
- /* Data is ready, now do the filtering */
-
- pxor xmm0, xmm0 /* clear xmm0 */
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
- /* if (flag) Des[-w4] = (((sum + x1) << 1) - x4 + x5) >> 4; */
- /* else Des[-w4] = Src[-w4]; */
- /* which is computed here as Src[-w4] + flag * ( newvalue - Src[-w4] ) */
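- /* The eight outputs below share a single running sum: each step adds the
-    pixel entering the 9-tap window and subtracts the one leaving it before
-    the next output is formed (the full recurrence is spelled out in the
-    scalar sketch after this routine). */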
-
- movdqa xmm3, xmm1 /* xmm3 = p1 */
- paddw xmm3, xmm3 /* xmm3 = p1 + p1 */
-
- paddw xmm3, xmm1 /* xmm3 = p1 + p1 + p1 */
- movdqa xmm4, [edi+16] /* xmm4 = x1 */
-
- paddw xmm3, [edi+32] /* xmm3 = p1+p1+p1+ x2 */
- paddw xmm4, [edi+48] /* xmm4 = x1+x3 */
-
- paddw xmm3, [edi+64] /* xmm3 += x4 */
- paddw xmm4, EightFours /* xmm4 = x1 + x3 + 4 */
-
- paddw xmm3, xmm4 /* xmm3 = 3*p1+x1+x2+x3+x4+4 */
- movdqa xmm4, xmm3 /* xmm4 = xmm3 */
-
- movdqa xmm5, [edi+16] /* xmm5 = x1 */
- paddw xmm4, xmm5 /* xmm4 = sum+x1 */
-
- psllw xmm4, 1 /* xmm4 = (sum+x1)<<1 */
- psubw xmm4, [edi+64] /* xmm4 = (sum+x1)<<1-x4 */
-
- paddw xmm4, [edi+80] /* xmm4 = (sum+x1)<<1-x4+x5 */
- psraw xmm4, 4 /* xmm4 >>=4 */
-
- psubw xmm4, xmm5 /* New Value - old Value */
- pand xmm4, xmm7 /* And the flag */
-
- paddw xmm4, xmm5 /* add the old value back */
- packuswb xmm4, xmm0 /* pack it to bytes */
-
- movq QWORD PTR [esi+edx*2], xmm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movdqa xmm5, [edi+32] /* xmm5= x2 */
- psubw xmm3, xmm1 /* sum=sum-p1 */
-
- paddw xmm3, [edi+80] /* sum=sum+x5 */
- movdqa xmm4, xmm5 /* copy sum */
-
- paddw xmm4, xmm3 /* xmm4=sum+x2 */
- paddw xmm4, xmm4 /* xmm4 <<= 1 */
-
- psubw xmm4, [edi+80] /* xmm4 =(sum+x2)<<1-x5 */
- paddw xmm4, [edi+96] /* xmm4 =(sum+x2)<<1-x5+x6 */
-
- psraw xmm4, 4 /* xmm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw xmm4, xmm5 /* new value - old value */
-
- pand xmm4, xmm7 /* And the flag */
- paddw xmm4, xmm5 /* add the old value back */
- packuswb xmm4, xmm0 /* pack it to bytes */
- movq QWORD PTR [esi+edx], xmm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movdqa xmm5, [edi+48] /* xmm5= x3 */
- psubw xmm3, xmm1 /* sum=sum-p1 */
-
- paddw xmm3, [edi+96] /* sum=sum+x6 */
- movdqa xmm4, xmm5 /* copy x3 */
-
- paddw xmm4, xmm3 /* xmm4=sum+x3 */
- paddw xmm4, xmm4 /* xmm4 <<= 1 */
-
- psubw xmm4, [edi+96] /* xmm4 =(sum+x3)<<1-x6 */
- paddw xmm4, [edi+112] /* xmm4 =(sum+x3)<<1-x6+x7 */
-
- psraw xmm4, 4 /* xmm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw xmm4, xmm5 /* new value - old value */
-
- pand xmm4, xmm7 /* And the flag */
- paddw xmm4, xmm5 /* add the old value back */
-
- packuswb xmm4, xmm0 /* pack it to bytes */
- movq QWORD PTR [esi],xmm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8)>>4 */
-
- movdqa xmm5, [edi+64] /* xmm5 = x4 */
- psubw xmm3, xmm1 /* sum = sum-p1 */
-
- paddw xmm3, [edi+112] /* sum = sum+x7 */
- movdqa xmm4, xmm5 /* xmm4 = x4 */
-
- paddw xmm4, xmm3 /* xmm4 = sum + x4 */
- paddw xmm4, xmm4 /* xmm4 *=2 */
-
- paddw xmm4, xmm1 /* += p1 */
- psubw xmm4, [edi+16] /* -= x1 */
-
- psubw xmm4, [edi+112] /* -= x7 */
- paddw xmm4, [edi+128] /* += x8 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x4 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x4 */
-
- packuswb xmm4, xmm0 /* pack it to bytes */
- movq QWORD PTR [esi+ecx], xmm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movdqa xmm5, [edi+80] /* xmm5 = x5 */
- psubw xmm3, [edi+16] /* sum -= x1 */
-
- paddw xmm3, [edi+128] /* sub += x8 */
- movdqa xmm4, xmm5 /* xmm4 = x5 */
-
- paddw xmm4, xmm3 /* xmm4= sum+x5 */
- paddw xmm4, xmm4 /* xmm4 *= 2 */
-
- paddw xmm4, [edi+16] /* += x1 */
- psubw xmm4, [edi+32] /* -= x2 */
-
- psubw xmm4, [edi+128] /* -= x8 */
- paddw xmm4, xmm2 /* += p2 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x5 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x5 */
-
- lea esi, [esi+ecx*4] /* esi=des + 2*pitch */
- packuswb xmm4, xmm0 /* pack to bytes */
- movq QWORD PTR [esi+edx*2], xmm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movdqa xmm5, [edi+96] /* xmm5 = x6 */
- psubw xmm3, [edi+32] /* -= x2 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x6 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x6 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+32] /* +=x2 */
- psubw xmm4, [edi+48] /* -=x3 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x6 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x6 */
-
- packuswb xmm4, xmm0 /* pack to bytes */
- movq QWORD PTR [esi+edx], xmm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movdqa xmm5, [edi+112] /* xmm5 = x7 */
- psubw xmm3, [edi+48] /* -= x3 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x7 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x7 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+48] /* +=x3 */
- psubw xmm4, [edi+64] /* -=x4 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x7 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x7 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movq QWORD PTR [esi],xmm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movdqa xmm5, [edi+128] /* xmm5 = x8 */
- psubw xmm3, [edi+64] /* -= x4 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x8 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x8 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+64] /* +=x4 */
- psubw xmm4, [edi+80] /* -=x5 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x8 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x8 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movq QWORD PTR [esi+ecx], xmm4 /* write new x8 */
- pop edi
- pop esi
- pop edx
- pop ecx
- pop eax
- } /* end of __asm */
- Var1=Variance1[0]+Variance1[1]+Variance1[2]+Variance1[3]+Variance1[4]+Variance1[5]+Variance1[6]+Variance1[7];
- Var2=Variance2[0]+Variance2[1]+Variance2[2]+Variance2[3]+Variance2[4]+Variance2[5]+Variance2[6]+Variance2[7];
- pbi->FragmentVariances[CurrentFrag] += Var1;
- pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
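-
- /* The per-column variances computed in the SSE2 code are summed across
-    the eight columns and accumulated into the per-fragment totals for the
-    fragments above and below the edge. */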
-
- }
- else
- {
- /* copy from src to des */
- __asm
- {
- push esi
- push edi
- push ecx
-
- mov esi, Src /* esi = Src */
- mov edi, Des /* edi = Des */
- push edx
- mov ecx, PlaneLineStep /* ecx = Pitch */
- xor edx, edx /* clear edx */
-
- sub edx, ecx /* edx = -Pitch */
- lea esi, [esi+edx*4] /* esi=Src-4*Pitch*/
-
- movq mm0, [esi] /* first row */
- movq [edi+edx*4], mm0 /* write first row */
-
- lea edi, [edi+edx*4] /* edi=Des-4*Pitch*/
- movq mm1, [esi+ecx] /* Src-3*Pitch */
- movq [edi+ecx], mm1 /* write second row */
- movq mm2, [esi+ecx*2] /* Src-2*Pitch */
- lea esi, [esi+ecx*4] /* Src */
- movq [edi+ecx*2], mm2 /* write third row */
- lea edi, [edi+ecx*4] /* Des */
- movq mm3, [esi+edx] /* Src-Pitch */
-
- movq [edi+edx], mm3 /* write fourth row */
- movq mm4, [esi] /* Src */
- movq mm5, [esi+ecx] /* Src+Pitch */
- movq [edi], mm4 /* write fifth row */
- movq mm6, [esi+ecx*2]
- lea esi, [esi+ecx*4] /* Src+pitch*4 */
- movq [edi+ecx], mm5 /* write the sixth row */
- movq [edi+ecx*2], mm6 /* write the seventh row */
- movq mm7, [esi+edx]
- lea edi, [edi+ecx*4] /* Des+Pitch*4 */
- movq [edi+edx], mm7 /* write the last row */
- pop edx
- pop ecx
- pop edi
- pop esi
- }
- }
-
- Src += 8;
- Des += 8;
- CurrentFrag ++;
- }
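-
- /* Reposition for the vertical-edge pass: back up one band width
-    (8*FragAcross) and one fragment height (8 lines), then advance 8 pixels
-    so filtering starts at the first internal vertical edge; Src is pointed
-    at the filtered output so this pass reads already-deblocked rows. */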
- Des -= ((PlaneLineStep + FragAcross)<<3);
- Des += 8;
- Src = Des;
- CurrentFrag = StartFrag ;
- while(CurrentFrag < StartFrag + FragAcross - 1)
- {
- QStep = QuantScale[pbi->FragQIndex[CurrentFrag+1]];
- if( QStep > 3 )
- {
- QStepWMT[0] = (INT16)QStep;
- QStepWMT[1] = (INT16)QStep;
- QStepWMT[2] = (INT16)QStep;
- QStepWMT[3] = (INT16)QStep;
- QStepWMT[4] = (INT16)QStep;
- QStepWMT[5] = (INT16)QStep;
- QStepWMT[6] = (INT16)QStep;
- QStepWMT[7] = (INT16)QStep;
- for( j=0; j<8;j++)
- {
- Rows[j] = (short) (Src[-5 +j*PlaneLineStep]);
- Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
- }
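- /* Rows[0..7] and Rows[72..79] now hold the columns of pixels at offsets
-    -5 and +4 from the vertical edge, one entry per row; the 8x8 block in
-    between is transposed into Rows[8..71] by the assembler code below. */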
- __asm
- {
- /* Save the registers */
- push eax
- push ecx
- push edx
- push esi
- push edi
-
- /* Calculate the FLimit and store FLimit and QStep */
-
- movdqa xmm0, QStepWMT /* Get QStep */
- movdqa xmm1, EightThrees /* xmm1 = 3 3 3 3 3 3 3 3 */
- pmullw xmm1, xmm0 /* xmm1 = QStep * 3 */
- pmullw xmm1, xmm0 /* xmm1 = QStep * QStep * 3 */
-
- psrlw xmm1, 5 /* xmm1 = FLimit */
- movdqa [FLimitWMT], xmm1 /* Save FLimit */
- /* setup the pointers to data */
- mov eax, Src /* eax = Src */
- xor edx, edx /* clear edx */
-
- mov esi, Des /* esi = Des */
- sub eax, 4 /* eax = Src-4 */
- sub esi, 4 /* esi = Des-4 */
- lea edi, Rows /* edi = Rows */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- sub edx, ecx /* edx = -Pitch */
- lea esi, [esi+ecx*2] /* esi = Des-4 + 2 * Pitch */
-
- /* Get the data to the intermediate buffer */
- movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
- movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
- movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
- lea eax, [eax+ecx*4] /* Go down four Rows */
- movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
- movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
-
- punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
- punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
- movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
- punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
- punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
- movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
- punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
- punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
-
- movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
- punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
- punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
- pxor mm7, mm7 /* clear mm7 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
- movq [edi+16], mm0 /* write 00 10 20 30 */
- punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
- movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
- movq [edi+32], mm5 /* write 01 11 21 31 */
-
- punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
- punpckhbw mm0, mm7 /* mm0 = 33 23 13 03 */
- movq [edi+48], mm1 /* write 02 12 22 32 */
- movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
-
- movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
- movq [edi+64], mm0 /* write 03 13 23 33 */
- punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
- punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
- movq [edi+80], mm2 /* write 04 14 24 34 */
- punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
- punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
- movq [edi+96], mm3 /* write 05 15 25 35 */
-
- movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
- movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
- movq [edi+112], mm4 /* write 06 16 26 36 */
- movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
- lea eax, [eax+ ecx*4] /* Go down four rows */
- movq [edi+128], mm5 /* write 07 17 27 37 */
- movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
- movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
- punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
- punpckhbw mm4, mm1 /* mm4 = 57 47 56 46 55 45 54 44 */
- movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
- punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
- punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
- movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
- punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
- punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
-
- movq mm2, mm4 /* mm2 = 57 47 56 46 55 45 54 44 */
- punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
- punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
- movq [edi+24], mm0 /* write 40 50 60 70 */
- punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
- movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
- movq [edi+40], mm5 /* write 41 51 61 71 */
-
- punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
- punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
- movq [edi+56], mm1 /* write 42 52 62 72 */
- movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
-
- movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
- movq [edi+72], mm0 /* write 43 53 63 73 */
- punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
- punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
- movq [edi+88], mm2 /* write 44 54 64 74 */
- punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
- punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
- movq [edi+104], mm3 /* write 45 55 65 75 */
-
- movq [edi+120], mm4 /* write 46 56 66 76 */
- movq [edi+136], mm5 /* write 47 57 67 77 */
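-
- /* The 8x8 block between the two columns loaded above is now in
-    Rows[8..71] as words, one 16-byte Rows entry per image column, so the
-    variance and filter code below is identical to the horizontal-edge case. */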
- /* we use xmm0,xmm1,xmm2 for 1234 and xmm4, xmm5, xmm6 for 5-8 */
- /* xmm7 = 0, xmm3 = {128, 128, 128, 128, 128, 128, 128, 128} */
-
- pcmpeqw xmm3, xmm3 /* xmm3 = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF */
- psllw xmm3, 15 /* xmm3 = 80008000800080008000800080008000 */
- psrlw xmm3, 8 /* xmm3 = 00800080008000800080008000800080 */
-
- movdqa xmm2, [edi+16] /* Pixel 1 */
- movdqa xmm6, [edi+80] /* Pixel 5 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- movdqa xmm0, xmm2 /* xmm0 = pixel 1 */
- movdqa xmm4, xmm6 /* xmm4 = pixel 5 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel1 * pixel1 */
- pmullw xmm6, xmm6 /* xmm6 = pixel5 * pixel5 */
-
- movdqa xmm1, xmm2 /* xmm1 = pixel1^2 */
- movdqa xmm5, xmm6 /* xmm5 = pixel5^2 */
-
- movdqa xmm2, [edi+32] /* Pixel 2 */
- movdqa xmm6, [edi+96] /* Pixel 6 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 2 */
- paddw xmm4, xmm6 /* xmm4 += pixel 6 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel2^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel6^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel2^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel6^2 */
-
- movdqa xmm2, [edi+48] /* Pixel 3 */
- movdqa xmm6, [edi+112] /* Pixel 7 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 3 */
- paddw xmm4, xmm6 /* xmm4 += pixel 7 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel3^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel7^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel3^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel7^2 */
-
- movdqa xmm2, [edi+64] /* Pixel 4 */
- movdqa xmm6, [edi+128] /* Pixel 8 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 4 */
- paddw xmm4, xmm6 /* xmm4 += pixel 8 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel4^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel8^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel4^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel8^2 */
-
- /* xmm0 = x1 + x2 + x3 + x4 */
- /* xmm1 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* xmm4 = x5 + x6 + x7 + x8 */
- /* xmm5 = x5^2 + x6^2 + x7^2 + x8^2 */
-
- movdqa xmm7, xmm3 /* xmm7 = xmm3 */
- psrlw xmm7, 7 /* xmm7 = 00010001000100010001000100010001 */
-
- movdqa xmm2, xmm0 /* make copy of sum1 */
- movdqa xmm6, xmm4 /* make copy of sum2 */
-
- paddw xmm0, xmm7 /* (sum1 + 1) */
- paddw xmm4, xmm7 /* (sum2 + 1) */
-
- psraw xmm2, 1 /* sum1 /2 */
- psraw xmm6, 1 /* sum2 /2 */
-
- psraw xmm0, 1 /* (sum1 + 1)/2 */
- psraw xmm4, 1 /* (sum2 + 1)/2 */
-
- pmullw xmm2, xmm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw xmm6, xmm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw xmm1, xmm2 /* Variance 1 */
- psubw xmm5, xmm6 /* Variance 2 */
-
- movdqa xmm7, FLimitWMT /* xmm7 = FLimit */
- movdqa xmm2, xmm1 /* copy of Variance 1 */
- movdqa [Variance1], xmm1 /* save Variance1 */
- movdqa [Variance2], xmm5 /* save Variance2 */
- movdqa xmm6, xmm5 /* Variance 2 */
- psubw xmm1, xmm7 /* Variance 1 < Flimit? */
-
- psubw xmm5, xmm7 /* Variance 2 < Flimit? */
- psraw xmm2, 15 /* Variance 1 >= 32768? */
- psraw xmm6, 15 /* Variance 2 >= 32768? */
- psraw xmm1, 15 /* FFFF/0000 for true/false */
-
- psraw xmm5, 15 /* FFFF/0000 for true/false */
- movdqa xmm7, [edi+64] /* xmm7 = Pixel 4 */
- pandn xmm2, xmm1 /* Variance1 < 32768 &&
- Variance1 < FLimit */
- pandn xmm6, xmm5 /* Variance2 < 32768 &&
- Variance2 < FLimit */
-
- movdqa xmm4, [edi+80] /* xmm4 = Pixel 5 */
- pand xmm6, xmm2 /* xmm6 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movdqa xmm2, xmm7 /* make copy of Pixel4 */
- psubusw xmm7, xmm4 /* 4 - 5 */
- psubusw xmm4, xmm2 /* 5 - 4 */
-
- por xmm7, xmm4 /* abs(4 - 5) */
- psubw xmm7, QStepWMT /* abs(4-5) < QStep ? */
-
- psraw xmm7, 15 /* FFFF/0000 for True/False */
- pand xmm7, xmm6
-
- /* xmm7 = Variance1 < FLimit && Variance2 < FLimit && abs(4-5) < QStep */
- /* xmm7 is now in use */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movdqa xmm5, [edi] /* xmm5 = -5 */
- movdqa xmm4, [edi + 16] /* xmm4 = -4 */
-
- movdqa xmm3, xmm4 /* copy of -4 */
- movdqa xmm6, xmm5 /* copy of -5 */
-
- psubusw xmm4, xmm6 /* xmm4 = [-4] - [-5] */
- psubusw xmm5, xmm3 /* xmm5 = [-5] - [-4] */
-
- por xmm4, xmm5 /* abs([-4]-[-5] ) */
- psubw xmm4, QStepWMT /* abs([-4]-[-5] )<QStep? */
-
- psraw xmm4, 15 /* FFFF/0000 for True/False */
- movdqa xmm1, xmm4 /* copy of the xmm4 */
-
- pand xmm4, xmm6 /* */
- pandn xmm1, xmm3 /* */
-
- por xmm1, xmm4 /* xmm1 = p1 */
-
- /* now find P2 */
-
- movdqa xmm4, [edi+128] /* xmm4 = [3] */
- movdqa xmm5, [edi+144] /* xmm5 = [4] */
-
- movdqa xmm3, xmm4 /* copy of 3 */
- movdqa xmm6, xmm5 /* copy of 4 */
-
- psubusw xmm4, xmm6 /* xmm4 = [3] - [4] */
- psubusw xmm5, xmm3 /* xmm5 = [4] - [3] */
-
- por xmm4, xmm5 /* abs([3]-[4] ) */
- psubw xmm4, QStepWMT /* abs([3]-[4] )<QStep? */
-
- psraw xmm4, 15 /* FFFF/0000 for True/False */
- movdqa xmm2, xmm4 /* copy of the xmm4 */
-
- pand xmm4, xmm6 /* */
- pandn xmm2, xmm3 /* */
-
- por xmm2, xmm4 /* xmm2 = p2 */
- /* Data is ready, now do the filtering */
-
- pxor xmm0, xmm0 /* clear xmm0 */
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
- /* if (flag) Des[-w4] = (((sum + x1) << 1) - x4 + x5) >> 4; */
- /* else Des[-w4] = Src[-w4]; */
- /* which is computed here as Src[-w4] + flag * ( newvalue - Src[-w4] ) */
-
- movdqa xmm3, xmm1 /* xmm3 = p1 */
- paddw xmm3, xmm3 /* xmm3 = p1 + p1 */
-
- paddw xmm3, xmm1 /* xmm3 = p1 + p1 + p1 */
- movdqa xmm4, [edi+16] /* xmm4 = x1 */
-
- paddw xmm3, [edi+32] /* xmm3 = p1+p1+p1+ x2 */
- paddw xmm4, [edi+48] /* xmm4 = x1+x3 */
-
- paddw xmm3, [edi+64] /* xmm3 += x4 */
- paddw xmm4, EightFours /* xmm4 = x1 + x3 + 4 */
-
- paddw xmm3, xmm4 /* xmm3 = 3*p1+x1+x2+x3+x4+4 */
- movdqa xmm4, xmm3 /* xmm4 = xmm3 */
-
- movdqa xmm5, [edi+16] /* xmm5 = x1 */
- paddw xmm4, xmm5 /* xmm4 = sum+x1 */
-
- psllw xmm4, 1 /* xmm4 = (sum+x1)<<1 */
- psubw xmm4, [edi+64] /* xmm4 = (sum+x1)<<1-x4 */
-
- paddw xmm4, [edi+80] /* xmm4 = (sum+x1)<<1-x4+x5 */
- psraw xmm4, 4 /* xmm4 >>=4 */
-
- psubw xmm4, xmm5 /* New Value - old Value */
- pand xmm4, xmm7 /* And the flag */
-
- paddw xmm4, xmm5 /* add the old value back */
- packuswb xmm4, xmm0 /* pack it to bytes */
-
- movdq2q mm0, xmm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movdqa xmm5, [edi+32] /* xmm5= x2 */
- psubw xmm3, xmm1 /* sum=sum-p1 */
-
- paddw xmm3, [edi+80] /* sum=sum+x5 */
- movdqa xmm4, xmm5 /* copy sum */
-
- paddw xmm4, xmm3 /* xmm4=sum+x2 */
- paddw xmm4, xmm4 /* xmm4 <<= 1 */
-
- psubw xmm4, [edi+80] /* xmm4 =(sum+x2)<<1-x5 */
- paddw xmm4, [edi+96] /* xmm4 =(sum+x2)<<1-x5+x6 */
-
- psraw xmm4, 4 /* xmm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw xmm4, xmm5 /* new value - old value */
-
- pand xmm4, xmm7 /* And the flag */
- paddw xmm4, xmm5 /* add the old value back */
- packuswb xmm4, xmm0 /* pack it to bytes */
- movdq2q mm1, xmm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movdqa xmm5, [edi+48] /* xmm5= x3 */
- psubw xmm3, xmm1 /* sum=sum-p1 */
-
- paddw xmm3, [edi+96] /* sum=sum+x6 */
- movdqa xmm4, xmm5 /* copy x3 */
-
- paddw xmm4, xmm3 /* xmm4=sum+x3 */
- paddw xmm4, xmm4 /* xmm4 <<= 1 */
-
- psubw xmm4, [edi+96] /* xmm4 =(sum+x3)<<1-x6 */
- paddw xmm4, [edi+112] /* xmm4 =(sum+x3)<<1-x6+x7 */
-
- psraw xmm4, 4 /* xmm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw xmm4, xmm5 /* new value - old value */
-
- pand xmm4, xmm7 /* And the flag */
- paddw xmm4, xmm5 /* add the old value back */
-
- packuswb xmm4, xmm0 /* pack it to bytes */
- movdq2q mm2, xmm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8)>>4 */
-
- movdqa xmm5, [edi+64] /* xmm5 = x4 */
- psubw xmm3, xmm1 /* sum = sum-p1 */
-
- paddw xmm3, [edi+112] /* sum = sum+x7 */
- movdqa xmm4, xmm5 /* xmm4 = x4 */
-
- paddw xmm4, xmm3 /* xmm4 = sum + x4 */
- paddw xmm4, xmm4 /* xmm4 *=2 */
-
- paddw xmm4, xmm1 /* += p1 */
- psubw xmm4, [edi+16] /* -= x1 */
-
- psubw xmm4, [edi+112] /* -= x7 */
- paddw xmm4, [edi+128] /* += x8 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x4 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x4 */
-
- packuswb xmm4, xmm0 /* pack it to bytes */
- movdq2q mm3, xmm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movdqa xmm5, [edi+80] /* xmm5 = x5 */
- psubw xmm3, [edi+16] /* sum -= x1 */
-
- paddw xmm3, [edi+128] /* sub += x8 */
- movdqa xmm4, xmm5 /* xmm4 = x5 */
-
- paddw xmm4, xmm3 /* xmm4= sum+x5 */
- paddw xmm4, xmm4 /* xmm4 *= 2 */
-
- paddw xmm4, [edi+16] /* += x1 */
- psubw xmm4, [edi+32] /* -= x2 */
-
- psubw xmm4, [edi+128] /* -= x8 */
- paddw xmm4, xmm2 /* += p2 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x5 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x5 */
-
- packuswb xmm4, xmm0 /* pack to bytes */
- movdq2q mm4, xmm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movdqa xmm5, [edi+96] /* xmm5 = x6 */
- psubw xmm3, [edi+32] /* -= x2 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x6 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x6 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+32] /* +=x2 */
- psubw xmm4, [edi+48] /* -=x3 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x6 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x6 */
-
- packuswb xmm4, xmm0 /* pack to bytes */
- movdq2q mm5, xmm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movdqa xmm5, [edi+112] /* xmm5 = x7 */
- psubw xmm3, [edi+48] /* -= x3 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x7 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x7 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+48] /* +=x3 */
- psubw xmm4, [edi+64] /* -=x4 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x7 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x7 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movdq2q mm6, xmm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movdqa xmm5, [edi+128] /* xmm5 = x8 */
- psubw xmm3, [edi+64] /* -= x4 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x8 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x8 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+64] /* +=x4 */
- psubw xmm4, [edi+80] /* -=x5 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x8 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x8 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movdq2q mm7, xmm4 /* write new x8 */
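-
- /* The eight filtered outputs are parked in mm0..mm7, one byte-packed
-    image column each (the new value at one horizontal position for all
-    eight rows); the transpose below turns them back into eight pixel rows
-    before they are written to Des. */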
- /* transpose */
- movq2dq xmm0, mm0 /* xmm0 = 70 60 50 40 30 20 10 00 */
- movq2dq xmm1, mm1 /* xmm1 = 71 61 51 41 31 21 11 01 */
- movq2dq xmm2, mm2 /* xmm2 = 72 62 52 42 32 22 12 02 */
- movq2dq xmm3, mm3 /* xmm3 = 73 63 53 43 33 23 13 03 */
- punpcklbw xmm0, xmm1 /* xmm0 = 7170 6160 5150 4140 3130 2120 1110 0100 */
- punpcklbw xmm2, xmm3 /* xmm2 = 7372 6362 5352 4342 3332 2322 1312 0302 */
- movdqa xmm1, xmm0 /* xmm1 = 7170 6160 5150 4140 3130 2120 1110 0100 */
- punpcklwd xmm0, xmm2 /* xmm0 = 33323130 23222120 13121110 03020100 */
- punpckhwd xmm1, xmm2 /* xmm1 = 73727170 63626160 53525150 43424140 */
-
- movq2dq xmm4, mm4 /* xmm4 = 74 64 54 44 34 24 14 04 */
- movq2dq xmm5, mm5 /* xmm5 = 75 65 55 45 35 25 15 05 */
- movq2dq xmm6, mm6 /* xmm6 = 76 66 56 46 36 26 16 06 */
- movq2dq xmm7, mm7 /* xmm7 = 77 67 57 47 37 27 17 07 */
-
- punpcklbw xmm4, xmm5 /* xmm4 = 7574 6564 5554 4544 3534 2524 1514 0504 */
- punpcklbw xmm6, xmm7 /* xmm6 = 7776 6766 5756 4746 3736 2726 1716 0706 */
- movdqa xmm5, xmm4 /* xmm5 = 7574 6564 5554 4544 3534 2524 1514 0504 */
- punpcklwd xmm4, xmm6 /* xmm4 = 37363534 27262524 17161514 07060504 */
- punpckhwd xmm5, xmm6 /* xmm5 = 77767574 67666564 57565554 47464544 */
- movdqa xmm2, xmm0 /* xmm2 = 33323130 23222120 13121110 03020100 */
- punpckldq xmm0, xmm4 /* xmm0 = 1716151413121110 0706050403020100 */
- movq QWORD PTR [esi+edx*2],xmm0 /* write 00 01 02 03 04 05 06 07 */
- psrldq xmm0, 8 /* xmm0 = 1716151413121110 */
- punpckhdq xmm2, xmm4 /* xmm2 = 3736353433323130 2726252423222120 */
- movq QWORD PTR [esi+edx], xmm0 /* write 10 11 12 13 14 15 16 17 */
- movdqa xmm3, xmm1 /* xmm3 = 73727170 63626160 53525150 43424140 */
-
- punpckldq xmm1, xmm5 /* xmm1 = 5756555453525150 4746454443424140 */
- movq QWORD PTR [esi], xmm2 /* write 20 21 22 23 24 25 26 27 */
-
- psrldq xmm2, 8 /* xmm2 = 3736353433323130 */
- punpckhdq xmm3, xmm5 /* xmm3 = 7776757473727170 6766656463626160 */
- movq QWORD PTR [esi+ecx], xmm2 /* write 30 31 32 33 34 35 36 37 */
- lea esi, [esi+ecx*4] /* esi = Des - 4 + 6*Pitch */
-
- movq QWORD PTR [esi+edx*2], xmm1 /* write 40 41 42 43 44 45 46 47 */
- movq QWORD PTR [esi], xmm3 /* write 60 61 62 63 64 65 66 67 */
- psrldq xmm1, 8 /* xmm1 = 5756555453525150 */
- psrldq xmm3, 8 /* xmm3 = 7776757473727170 */
- movq QWORD PTR [esi+edx], xmm1 /* write 50 51 52 53 54 55 56 57 */
- movq QWORD PTR [esi+ecx], xmm3 /* write 70 71 72 73 74 75 76 77 */
- pop edi
- pop esi
- pop edx
- pop ecx
- pop eax
- }// end of __asm
- Var1=Variance1[0]+Variance1[1]+Variance1[2]+Variance1[3]+Variance1[4]+Variance1[5]+Variance1[6]+Variance1[7];
- Var2=Variance2[0]+Variance2[1]+Variance2[2]+Variance2[3]+Variance2[4]+Variance2[5]+Variance2[6]+Variance2[7];
- pbi->FragmentVariances[CurrentFrag] += Var1;
- pbi->FragmentVariances[CurrentFrag + 1] += Var2;
- }// end of if
- CurrentFrag ++;
- Src += 8;
- Des += 8;
- }//end of while
- #endif
- }
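-
- /****************************************************************************
-  * Scalar reference sketch (illustrative only): the helper below is not part
-  * of the original source; its name and plain C types are made up here to
-  * document what DeblockLoopFilteredBand_WMT computes for a single column.
-  * x[0..9] is one column of ten pixels crossing the fragment edge between
-  * x[4] and x[5]; out[0..7] receives the eight filtered pixels. The SSE2
-  * code performs the same arithmetic in 16-bit lanes, eight columns at once,
-  * and also accumulates the two variances into pbi->FragmentVariances.
-  *****************************************************************************
-  */
- static void DeblockColumnScalarSketch( const unsigned char x[10],
-                                        unsigned char out[8],
-                                        int QStep, int FLimit )
- {
-     int s1 = 0, s2 = 0, ss1 = 0, ss2 = 0, i;
-     int Var1, Var2, p1, p2, sum;
-
-     for( i = 1; i <= 4; i++ ) { s1 += x[i] - 128; ss1 += (x[i]-128) * (x[i]-128); }
-     for( i = 5; i <= 8; i++ ) { s2 += x[i] - 128; ss2 += (x[i]-128) * (x[i]-128); }
-
-     Var1 = ss1 - ( s1 >> 1 ) * ( ( s1 + 1 ) >> 1 );   /* ~ 4 * variance, side 1 */
-     Var2 = ss2 - ( s2 >> 1 ) * ( ( s2 + 1 ) >> 1 );   /* ~ 4 * variance, side 2 */
-
-     /* Filter only when both sides are flat and the step across the edge is small */
-     if( Var1 >= FLimit || Var2 >= FLimit || abs( x[4] - x[5] ) >= QStep )
-     {
-         for( i = 0; i < 8; i++ ) out[i] = x[i+1];      /* leave the column alone */
-         return;
-     }
-
-     /* Outer taps: reuse the neighbours only if they belong to the same surface */
-     p1 = ( abs( x[1] - x[0] ) < QStep ) ? x[0] : x[1];
-     p2 = ( abs( x[8] - x[9] ) < QStep ) ? x[9] : x[8];
-
-     /* 9-tap smoothing with a running sum (packuswb saturation omitted) */
-     sum = p1 + p1 + p1 + x[1] + x[2] + x[3] + x[4] + 4;
-     out[0] = (unsigned char)( ( ( ( sum + x[1] ) << 1 ) - x[4] + x[5] ) >> 4 );
-     sum += x[5] - p1;
-     out[1] = (unsigned char)( ( ( ( sum + x[2] ) << 1 ) - x[5] + x[6] ) >> 4 );
-     sum += x[6] - p1;
-     out[2] = (unsigned char)( ( ( ( sum + x[3] ) << 1 ) - x[6] + x[7] ) >> 4 );
-     sum += x[7] - p1;
-     out[3] = (unsigned char)( ( ( ( sum + x[4] ) << 1 ) + p1 - x[1] - x[7] + x[8] ) >> 4 );
-     sum += x[8] - x[1];
-     out[4] = (unsigned char)( ( ( ( sum + x[5] ) << 1 ) + x[1] - x[2] - x[8] + p2 ) >> 4 );
-     sum += p2 - x[2];
-     out[5] = (unsigned char)( ( ( ( sum + x[6] ) << 1 ) + x[2] - x[3] ) >> 4 );
-     sum += p2 - x[3];
-     out[6] = (unsigned char)( ( ( ( sum + x[7] ) << 1 ) + x[3] - x[4] ) >> 4 );
-     sum += p2 - x[4];
-     out[7] = (unsigned char)( ( ( ( sum + x[8] ) << 1 ) + x[4] - x[5] ) >> 4 );
- }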
- /****************************************************************************
- *
- * ROUTINE : DeblockNonFilteredBand_WMT
- *
- * INPUTS : pbi, SrcPtr, DesPtr, PlaneLineStep, FragAcross, StartFrag,
- *          QuantScale
- *
- * OUTPUTS : Deblocked pixels written through DesPtr
- *
- * RETURNS : None
- *
- * FUNCTION : Filter both horizontal and vertical edges in a band
- *
- * SPECIAL NOTES :
- *
- * REFERENCE :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void DeblockNonFilteredBand_WMT(
- POSTPROC_INSTANCE *pbi,
- UINT8 *SrcPtr,
- UINT8 *DesPtr,
- UINT32 PlaneLineStep,
- UINT32 FragAcross,
- UINT32 StartFrag,
- UINT32 *QuantScale
- )
- {
- UINT32 j;
- UINT32 CurrentFrag=StartFrag;
- UINT32 QStep;
- UINT32 LoopFLimit;
- UINT8 *Src, *Des;
- UINT32 Var1, Var2;
- #if defined(_WIN32_WCE)
- return;
- #else
- __declspec(align(16)) short QStepWMT[8];
- __declspec(align(16)) short FLimitWMT[8];
- __declspec(align(16)) short Rows[80];
- __declspec(align(16)) short LoopFLimitWMT[8];
- __declspec(align(16)) short LoopFilteredValuesUp[8];
- __declspec(align(16)) short LoopFilteredValuesDown[8];
- __declspec(align(16)) unsigned short Variance1[8];
- __declspec(align(16)) unsigned short Variance2[8];
- LoopFLimit = DeblockLimitValuesV2[pbi->FrameQIndex];
- LoopFLimitWMT[0] = (INT16)LoopFLimit;
- LoopFLimitWMT[1] = (INT16)LoopFLimit;
- LoopFLimitWMT[2] = (INT16)LoopFLimit;
- LoopFLimitWMT[3] = (INT16)LoopFLimit;
- LoopFLimitWMT[4] = (INT16)LoopFLimit;
- LoopFLimitWMT[5] = (INT16)LoopFLimit;
- LoopFLimitWMT[6] = (INT16)LoopFLimit;
- LoopFLimitWMT[7] = (INT16)LoopFLimit;
- while(CurrentFrag < StartFrag + FragAcross )
- {
- Src=SrcPtr+8*(CurrentFrag-StartFrag);
- Des=DesPtr+8*(CurrentFrag-StartFrag);
- QStep = QuantScale[ pbi->FragQIndex[CurrentFrag+FragAcross]];
- __asm
- {
-
- push eax
- push ecx
- push edx
- push esi
- push edi
-
- /* Calculate the FLimit and store FLimit and QStep */
- /* Copy the data to the intermediate buffer */
- mov eax, QStep
- xor edx, edx /* clear edx */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- pcmpeqw xmm6, xmm6 /* xmm6 = FFFFFF... */
-
-
- movd mm5, eax /* mm5 = QStep */
- psrlw xmm6, 14 /* xmm6 = 3, 3, 3, 3, 3, 3, 3, 3*/
-
- punpcklwd mm5, mm5 /* mm5 = QQ */
- mov eax, Src /* eax = Src */
-
- punpckldq mm5, mm5 /* mm5 = QQQQ */
- sub edx, ecx /* edx = - Pitch */
-
- movq2dq xmm5, mm5 /* xmm5 = QQQQ */
- punpcklqdq xmm5, xmm5 /* xmm5 = QQQQQQQQ */
-
- pmullw xmm6, xmm5 /* Qstep * 3 */
- movdqa QStepWMT, xmm5
-
- lea edi, Rows /* edi = Rows */
- pxor xmm7, xmm7 /* Clear xmm7 */
- mov esi, Des /* esi = des */
- pmullw xmm6, xmm5
-
- lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
- lea esi, [esi + edx * 2] /* esi = Des - 2*Pitch */
- psraw xmm6, 5
- movdqa FLimitWMT, xmm6
- /* Copy the data to the intermediate buffer */
-
- movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[-5*Pitch] */
- movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[-4*Pitch] */
-
- punpcklbw xmm0, xmm7 /* expand to words */
- punpcklbw xmm1, xmm7 /* expand to words */
- movdqa [edi], xmm0 /* write 8 words */
- movdqa [edi+16], xmm1 /* write 8 words */
- movq xmm2, QWORD PTR [eax+ecx] /* xmm2 = Src[-3*Pitch] */
- movq xmm3, QWORD PTR [eax+ecx*2]/* xmm3 = Src[-2*Pitch] */
- punpcklbw xmm2, xmm7 /* expand to words */
- punpcklbw xmm3, xmm7 /* expand to words */
-
- movdqa [edi+32], xmm2 /* write 8 words */
- movdqa [edi+48], xmm3 /* write 8 words */
- lea eax, [eax+ecx*4] /* eax= Src */
- movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[-Pitch] */
- movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[0] */
-
- punpcklbw xmm0, xmm7 /* expand to words */
- punpcklbw xmm1, xmm7 /* expand to words */
- movdqa [edi+64], xmm0 /* write 8 words */
- movdqa [edi+80], xmm1 /* write 8 words */
- movq xmm2, QWORD PTR [eax+ecx] /* xmm2 = Src[Pitch] */
- movq xmm3, QWORD PTR [eax+ecx*2]/* xmm3 = Src[2*Pitch] */
- punpcklbw xmm2, xmm7 /* expand to words */
- punpcklbw xmm3, xmm7 /* expand to words */
-
- movdqa [edi+96], xmm2 /* write 8 words */
- movdqa [edi+112], xmm3 /* write 8 words */
- lea eax, [eax+ecx*4] /* eax= Src+4*Pitch */
- movq xmm0, QWORD PTR [eax + edx]/* xmm0 = Src[3*Pitch] */
- movq xmm1, QWORD PTR [eax ] /* xmm1 = Src[4*Pitch] */
-
- punpcklbw xmm0, xmm7 /* expand to words */
- punpcklbw xmm1, xmm7 /* expand to words */
- movdqa [edi+128], xmm0 /* write 8 words */
- movdqa [edi+144], xmm1 /* write 8 words */
-
- /* done with copying everything to intermediate buffer */
- /* Now, compute the variances for Pixel 1-4 and 5-8 */
-
- /* we use xmm0,xmm1,xmm2 for 1234 and xmm4, xmm5, xmm6 for 5-8 */
- /* xmm7 = 0, xmm3 = {128, 128, 128, 128, 128, 128, 128, 128} */
-
- pcmpeqw xmm3, xmm3 /* xmm3 = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF */
- psllw xmm3, 15 /* xmm3 = 80008000800080008000800080008000 */
- psrlw xmm3, 8 /* xmm3 = 00800080008000800080008000800080 */
-
- movdqa xmm2, [edi+16] /* Pixel 1 */
- movdqa xmm6, [edi+80] /* Pixel 5 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- movdqa xmm0, xmm2 /* xmm0 = pixel 1 */
- movdqa xmm4, xmm6 /* xmm4 = pixel 5 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel1 * pixel1 */
- pmullw xmm6, xmm6 /* xmm6 = pixel5 * pixel5 */
-
- movdqa xmm1, xmm2 /* xmm1 = pixel1^2 */
- movdqa xmm5, xmm6 /* xmm5 = pixel5^2 */
-
- movdqa xmm2, [edi+32] /* Pixel 2 */
- movdqa xmm6, [edi+96] /* Pixel 6 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 2 */
- paddw xmm4, xmm6 /* xmm4 += pixel 6 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel2^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel6^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel2^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel6^2 */
-
- movdqa xmm2, [edi+48] /* Pixel 3 */
- movdqa xmm6, [edi+112] /* Pixel 7 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 3 */
- paddw xmm4, xmm6 /* xmm4 += pixel 7 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel3^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel7^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel3^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel7^2 */
-
- movdqa xmm2, [edi+64] /* Pixel 4 */
- movdqa xmm6, [edi+128] /* Pixel 8 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 4 */
- paddw xmm4, xmm6 /* xmm4 += pixel 8 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel4^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel8^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel4^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel8^2 */
-
- /* xmm0 = x1 + x2 + x3 + x4 */
- /* xmm1 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* xmm4 = x5 + x6 + x7 + x8 */
- /* xmm5 = x5^2 + x6^2 + x7^2 + x8^2 */
-
- movdqa xmm7, xmm3 /* xmm7 = xmm3 */
- psrlw xmm7, 7 /* xmm7 = 00010001000100010001000100010001 */
-
- movdqa xmm2, xmm0 /* make copy of sum1 */
- movdqa xmm6, xmm4 /* make copy of sum2 */
-
- paddw xmm0, xmm7 /* (sum1 + 1) */
- paddw xmm4, xmm7 /* (sum2 + 1) */
-
- psraw xmm2, 1 /* sum1 /2 */
- psraw xmm6, 1 /* sum2 /2 */
-
- psraw xmm0, 1 /* (sum1 + 1)/2 */
- psraw xmm4, 1 /* (sum2 + 1)/2 */
-
- pmullw xmm2, xmm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw xmm6, xmm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw xmm1, xmm2 /* Variance 1 */
- psubw xmm5, xmm6 /* Variance 2 */
-
- movdqa xmm7, FLimitWMT /* xmm7 = FLimit */
- movdqa xmm2, xmm1 /* copy of Variance 1 */
- movdqa [Variance1], xmm1 /* save Variance1 */
- movdqa [Variance2], xmm5 /* save Variance2 */
- movdqa xmm6, xmm5 /* Variance 2 */
- psubw xmm1, xmm7 /* Variance 1 < Flimit? */
-
- psubw xmm5, xmm7 /* Variance 2 < Flimit? */
- psraw xmm2, 15 /* Variance 1 >= 32768? */
- psraw xmm6, 15 /* Variance 2 >= 32768? */
- psraw xmm1, 15 /* FFFF/0000 for true/false */
-
- psraw xmm5, 15 /* FFFF/0000 for true/false */
- movdqa xmm7, [edi+64] /* xmm7 = Pixel 4 */
- pandn xmm2, xmm1 /* Variance1 < 32768 &&
- Variance1 < FLimit */
- pandn xmm6, xmm5 /* Variance2 < 32768 &&
- Variance2 < FLimit */
-
- movdqa xmm4, [edi+80] /* xmm4 = Pixel 5 */
- pand xmm6, xmm2 /* xmm6 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movdqa xmm2, xmm7 /* make copy of Pixel4 */
- psubusw xmm7, xmm4 /* 4 - 5 */
- psubusw xmm4, xmm2 /* 5 - 4 */
-
- por xmm7, xmm4 /* abs(4 - 5) */
- psubw xmm7, QStepWMT /* abs(4-5) < QStep ? */
-
- psraw xmm7, 15 /* FFFF/0000 for True/False */
- pand xmm7, xmm6
-
- /* xmm7 = Variance1 < FLimit && Variance2 < FLimit && abs(4-5) < QStep */
- /* xmm7 is now in use */
-
- /* find the loop filtered values for the pixels on block boundary */
- movdqa xmm1, LoopFLimitWMT /* Get the FLimit values for the loop filter */
- movdqa xmm3, [edi + 48] /* xmm3 = x3 = p[-2] */
- movdqa xmm4, [edi + 64] /* xmm4 = x4 = p[-1] */
- movdqa xmm5, [edi + 80] /* xmm5 = x5 = p[ 0] */
- movdqa xmm6, [edi + 96] /* xmm6 = x6 = p[ 1] */
- psubw xmm5, xmm4 /* xmm5 = p[ 0] - p[-1] */
- psubw xmm3, xmm6 /* xmm3 = p[-2] - p[ 1] */
- movdqa xmm4, xmm5 /* make a copy */
- paddw xmm4, xmm5 /* 2 * ( p[0] - p[-1] ) */
- paddw xmm3, EightFours /* xmm3 + 4 */
- paddw xmm5, xmm4 /* 3 * ( p[0] - p[-1] ) */
- paddw xmm3, xmm5 /* Filtval before shift */
- psraw xmm3, 3 /* FiltVal */
- movdqa xmm2, xmm3 /* make a copy */
- psraw xmm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor xmm2, xmm3
- psubsw xmm2, xmm3 /* mm2 = abs(FiltVal) */
- por xmm3, EightOnes /* -1 and 1 for + and - */
- movdqa xmm4, xmm1 /* make a copy of Flimit */
- psubw xmm1, xmm2 /* mm1= Flimit - abs(FiltVal) */
- movdqa xmm5, xmm1 /* copy Flimit - abs(FiltVal) */
- psraw xmm1, 15 /* FFFF or 0000 */
- pxor xmm5, xmm1
- psubsw xmm5, xmm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw xmm4, xmm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw xmm4, xmm3 /* get the sign back */
- movdqa xmm1, [edi+64] /* p[-1] */
- movdqa xmm2, [edi+80] /* p[0] */
-
- paddw xmm1, xmm4 /* p[-1] + NewFiltVal */
- psubw xmm2, xmm4 /* p[0] - NewFiltVal */
- pxor xmm6, xmm6 /* clear xmm6 */
- packuswb xmm1, xmm1 /* clamping */
- packuswb xmm2, xmm2
- punpcklbw xmm1, xmm6 /* unpack to word */
-
- movdqa LoopFilteredValuesUp, xmm1 /* save the values */
- punpcklbw xmm2, xmm6 /* unpack to word */
- movdqa LoopFilteredValuesDown, xmm2 /* save the values */
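-
- /* Sketch of the loop-filter values just stored (per column); the helper
-    names here are illustrative, not from the source:
-       FiltVal   = ( p[-2] - p[1] + 3 * (p[0] - p[-1]) + 4 ) >> 3;
-       Modulated = sign(FiltVal) *
-                   max( 0, LoopFLimit - abs(LoopFLimit - abs(FiltVal)) );
-       Up   = clamp255( p[-1] + Modulated );   saved to LoopFilteredValuesUp
-       Down = clamp255( p[0]  - Modulated );   saved to LoopFilteredValuesDown */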
-
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
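-
- /* The selects below are branchless: with mask = FFFF/0000 from the compare,
-       p1 = (mask & Src[-5]) | (~mask & Src[-4]);
-    (sketch only; the same pand/pandn/por pattern is reused for p2). */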
-
- movdqa xmm5, [edi] /* xmm5 = -5 */
- movdqa xmm4, [edi + 16] /* xmm4 = -4 */
-
- movdqa xmm3, xmm4 /* copy of -4 */
- movdqa xmm6, xmm5 /* copy of -5 */
-
- psubusw xmm4, xmm6 /* xmm4 = [-4] - [-5] */
- psubusw xmm5, xmm3 /* xmm5 = [-5] - [-4] */
-
- por xmm4, xmm5 /* abs([-4]-[-5] ) */
- psubw xmm4, QStepWMT /* abs([-4]-[-5] )<QStep? */
-
- psraw xmm4, 15 /* FFFF/0000 for True/False */
- movdqa xmm1, xmm4 /* copy of the xmm4 */
-
- pand xmm4, xmm6 /* keep [-5] where condition true */
- pandn xmm1, xmm3 /* keep [-4] where condition false */
-
- por xmm1, xmm4 /* xmm1 = p1 */
-
- /* now find P2 */
-
- movdqa xmm4, [edi+128] /* xmm4 = [3] */
- movdqa xmm5, [edi+144] /* xmm5 = [4] */
-
- movdqa xmm3, xmm4 /* copy of 3 */
- movdqa xmm6, xmm5 /* copy of 4 */
-
- psubusw xmm4, xmm6 /* xmm4 = [3] - [4] */
- psubusw xmm5, xmm3 /* xmm5 = [4] - [3] */
-
- por xmm4, xmm5 /* abs([3]-[4] ) */
- psubw xmm4, QStepWMT /* abs([3]-[4] )<QStep? */
-
- psraw xmm4, 15 /* FFFF/0000 for True/False */
- movdqa xmm2, xmm4 /* copy of the xmm4 */
-
- pand xmm4, xmm6 /* keep [4] where condition true */
- pandn xmm2, xmm3 /* keep [3] where condition false */
-
- por xmm2, xmm4 /* xmm2 = p2 */
- /* Data is ready, now do the filtering */
-
- pxor xmm0, xmm0 /* clear xmm0 */
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
- /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
- /* otherwise Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
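-
- /* Scalar sketch of the whole filtering pass below (one column).  A rolling
-    sum is kept and one tap is swapped in or out before each output:
-       sum  = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4;
-       new1 = ( (sum + x1) * 2 - x4 + x5 ) >> 4;    sum += x5 - p1;
-       new2 = ( (sum + x2) * 2 - x5 + x6 ) >> 4;    sum += x6 - p1;
-       ... and so on through x8, with p2 entering the window at x5.
-    Each newN only replaces xN where the flag in xmm7 is set. */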
-
- movdqa xmm3, xmm1 /* xmm3 = p1 */
- paddw xmm3, xmm3 /* xmm3 = p1 + p1 */
-
- paddw xmm3, xmm1 /* xmm3 = p1 + p1 + p1 */
- movdqa xmm4, [edi+16] /* xmm4 = x1 */
-
- paddw xmm3, [edi+32] /* xmm3 = p1+p1+p1+ x2 */
- paddw xmm4, [edi+48] /* xmm4 = x1+x3 */
-
- paddw xmm3, [edi+64] /* xmm3 += x4 */
- paddw xmm4, EightFours /* xmm4 = x1 + x3 + 4 */
-
- paddw xmm3, xmm4 /* xmm3 = 3*p1+x1+x2+x3+x4+4 */
- movdqa xmm4, xmm3 /* xmm4 = xmm3 */
-
- movdqa xmm5, [edi+16] /* xmm5 = x1 */
- paddw xmm4, xmm5 /* xmm4 = sum+x1 */
-
- psllw xmm4, 1 /* xmm4 = (sum+x1)<<1 */
- psubw xmm4, [edi+64] /* xmm4 = (sum+x1)<<1-x4 */
-
- paddw xmm4, [edi+80] /* xmm4 = (sum+x1)<<1-x4+x5 */
- psraw xmm4, 4 /* xmm4 >>=4 */
-
- psubw xmm4, xmm5 /* New Value - old Value */
- pand xmm4, xmm7 /* And the flag */
-
- paddw xmm4, xmm5 /* add the old value back */
- packuswb xmm4, xmm0 /* pack it to bytes */
-
- movq QWORD PTR [esi+edx*2], xmm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movdqa xmm5, [edi+32] /* xmm5= x2 */
- psubw xmm3, xmm1 /* sum=sum-p1 */
-
- paddw xmm3, [edi+80] /* sum=sum+x5 */
- movdqa xmm4, xmm5 /* copy sum */
-
- paddw xmm4, xmm3 /* xmm4=sum+x2 */
- paddw xmm4, xmm4 /* xmm4 <<= 1 */
-
- psubw xmm4, [edi+80] /* xmm4 =(sum+x2)<<1-x5 */
- paddw xmm4, [edi+96] /* xmm4 =(sum+x2)<<1-x5+x6 */
-
- psraw xmm4, 4 /* xmm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw xmm4, xmm5 /* new value - old value */
-
- pand xmm4, xmm7 /* And the flag */
- paddw xmm4, xmm5 /* add the old value back */
- packuswb xmm4, xmm0 /* pack it to bytes */
- movq QWORD PTR [esi+edx], xmm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movdqa xmm5, [edi+48] /* xmm5= x3 */
- psubw xmm3, xmm1 /* sum=sum-p1 */
-
- paddw xmm3, [edi+96] /* sum=sum+x6 */
- movdqa xmm4, xmm5 /* copy x3 */
-
- paddw xmm4, xmm3 /* xmm4=sum+x3 */
- paddw xmm4, xmm4 /* xmm4 <<= 1 */
-
- psubw xmm4, [edi+96] /* xmm4 =(sum+x3)<<1-x6 */
- paddw xmm4, [edi+112] /* xmm4 =(sum+x3)<<1-x6+x7 */
-
- psraw xmm4, 4 /* xmm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw xmm4, xmm5 /* new value - old value */
-
- pand xmm4, xmm7 /* And the flag */
- paddw xmm4, xmm5 /* add the old value back */
-
- packuswb xmm4, xmm0 /* pack it to bytes */
- movq QWORD PTR [esi],xmm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8)>>4 */
-
- movdqa xmm5, [edi+64] /* xmm5 = x4 */
- psubw xmm3, xmm1 /* sum = sum-p1 */
-
- paddw xmm3, [edi+112] /* sum = sum+x7 */
- movdqa xmm4, xmm5 /* xmm4 = x4 */
-
- paddw xmm4, xmm3 /* xmm4 = sum + x4 */
- paddw xmm4, xmm4 /* xmm4 *=2 */
-
- paddw xmm4, xmm1 /* += p1 */
- psubw xmm4, [edi+16] /* -= x1 */
-
- psubw xmm4, [edi+112] /* -= x7 */
- paddw xmm4, [edi+128] /* += x8 */
-
- movdqa xmm5, LoopFilteredValuesUp /* Read the loop filtered value of x4 */
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x4 */
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x4 */
- packuswb xmm4, xmm0 /* pack it to bytes */
- movq QWORD PTR [esi+ecx], xmm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movdqa xmm5, [edi+80] /* xmm5 = x5 */
- psubw xmm3, [edi+16] /* sum -= x1 */
-
- paddw xmm3, [edi+128] /* sum += x8 */
- movdqa xmm4, xmm5 /* xmm4 = x5 */
-
- paddw xmm4, xmm3 /* xmm4= sum+x5 */
- paddw xmm4, xmm4 /* xmm4 *= 2 */
-
- paddw xmm4, [edi+16] /* += x1 */
- psubw xmm4, [edi+32] /* -= x2 */
-
- psubw xmm4, [edi+128] /* -= x8 */
- paddw xmm4, xmm2 /* += p2 */
-
- movdqa xmm5, LoopFilteredValuesDown /* Read the loop filtered value of x5 */
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x5 */
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x5 */
- lea esi, [esi+ecx*4] /* esi=des + 2*pitch */
- packuswb xmm4, xmm0 /* pack to bytes */
- movq QWORD PTR [esi+edx*2], xmm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movdqa xmm5, [edi+96] /* xmm5 = x6 */
- psubw xmm3, [edi+32] /* -= x2 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x6 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x6 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+32] /* +=x2 */
- psubw xmm4, [edi+48] /* -=x3 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x6 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x6 */
-
- packuswb xmm4, xmm0 /* pack to bytes */
- movq QWORD PTR [esi+edx], xmm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movdqa xmm5, [edi+112] /* xmm5 = x7 */
- psubw xmm3, [edi+48] /* -= x3 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x7 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x7 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+48] /* +=x3 */
- psubw xmm4, [edi+64] /* -=x4 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x7 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x7 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movq QWORD PTR [esi],xmm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movdqa xmm5, [edi+128] /* xmm5 = x8 */
- psubw xmm3, [edi+64] /* -= x4 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x8 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x8 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+64] /* +=x4 */
- psubw xmm4, [edi+80] /* -=x5 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x8 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x8 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movq QWORD PTR [esi+ecx], xmm4 /* write new x8 */
- pop edi
- pop esi
- pop edx
- pop ecx
- pop eax
-
- } /* end of the macro */
-
- Var1=Variance1[0]+Variance1[1]+Variance1[2]+Variance1[3]+Variance1[4]+Variance1[5]+Variance1[6]+Variance1[7];
- Var2=Variance2[0]+Variance2[1]+Variance2[2]+Variance2[3]+Variance2[4]+Variance2[5]+Variance2[6]+Variance2[7];
- pbi->FragmentVariances[CurrentFrag] += Var1;
- pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
-
- if(CurrentFrag==StartFrag)
- CurrentFrag++;
- else
- {
-
- Des=DesPtr-8*PlaneLineStep+8*(CurrentFrag-StartFrag);
- Src=Des;
- QStep = QuantScale[pbi->FragQIndex[CurrentFrag]];
- QStepWMT[0] = (INT16)QStep;
- QStepWMT[1] = (INT16)QStep;
- QStepWMT[2] = (INT16)QStep;
- QStepWMT[3] = (INT16)QStep;
- QStepWMT[4] = (INT16)QStep;
- QStepWMT[5] = (INT16)QStep;
- QStepWMT[6] = (INT16)QStep;
- QStepWMT[7] = (INT16)QStep;
- for( j=0; j<8;j++)
- {
- Rows[j] = (short) (Src[-5 +j*PlaneLineStep]);
- Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
- }
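- /* Rows[] acts as a 10x8 word buffer laid out column-first: words 0..7 hold
-    Src[-5] for each of the 8 rows, words 72..79 hold Src[+4], and the 8x8
-    block in between is filled by the transpose in the assembly below.  A
-    scalar sketch (illustration only) of that middle part:
-       for ( k = 0; k < 8; k++ )
-           for ( j = 0; j < 8; j++ )
-               Rows[ (k+1)*8 + j ] = (short) Src[ -4 + k + j*PlaneLineStep ];
- */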
- __asm
- {
- /* Save the registers */
- push eax
- push ecx
- push edx
- push esi
- push edi
-
- /* Calculate the FLimit and store FLimit and QStep */
-
- movdqa xmm0, QStepWMT /* Get QStep */
- movdqa xmm1, EightThrees /* mm1 = 03030303 */
- pmullw xmm1, xmm0 /* mm1 = QStep * 3 */
- pmullw xmm1, xmm0 /* mm1 = QStep * QStep * 3 */
-
- psrlw xmm1, 5 /* mm1 = FLimit */
- movdqa [FLimitWMT], xmm1 /* Save FLimit */
- /* setup the pointers to data */
- mov eax, Src /* eax = Src */
- xor edx, edx /* clear edx */
-
- mov esi, Des /* esi = Des */
- sub eax, 4 /* eax = Src-4 */
- sub esi, 4 /* esi = Des-4 */
- lea edi, Rows /* edi = Rows */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- sub edx, ecx /* edx = -Pitch */
- lea esi, [esi+ecx*2] /* esi = Des-4 + 2 * Pitch */
-
- /* Get the data to the intermediate buffer */
- movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
- movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
- movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
- lea eax, [eax+ecx*4] /* Go down four Rows */
- movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
- movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
-
- punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
- punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
- movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
- punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
- punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
- movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
- punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
- punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
-
- movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
- punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
- punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
- pxor mm7, mm7 /* clear mm7 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
- movq [edi+16], mm0 /* write 00 10 20 30 */
- punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
- movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
- movq [edi+32], mm5 /* write 01 11 21 31 */
-
- punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
- punpckhbw mm0, mm7 /* mm0 = 33 23 13 03 */
- movq [edi+48], mm1 /* write 02 12 22 32 */
- movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
-
- movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
- movq [edi+64], mm0 /* write 03 13 23 33 */
- punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
- punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
- movq [edi+80], mm2 /* write 04 14 24 34 */
- punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
- punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
- movq [edi+96], mm3 /* write 05 15 25 35 */
-
- movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
- movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
- movq [edi+112], mm4 /* write 06 16 26 36 */
- movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
- lea eax, [eax+ ecx*4] /* Go down four rows */
- movq [edi+128], mm5 /* write 07 17 27 37 */
- movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
- movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
- punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
- punpckhbw mm4, mm1 /* mm4 = 57 47 56 46 55 45 54 44 */
- movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
- punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
- punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
- movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
- punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
- punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
-
- movq mm2, mm4 /* mm2 = 57 47 56 46 55 45 54 44 */
- punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
- punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
- movq [edi+24], mm0 /* write 40 50 60 70 */
- punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
- movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
- movq [edi+40], mm5 /* write 41 51 61 71 */
-
- punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
- punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
- movq [edi+56], mm1 /* write 42 52 62 72 */
- movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
-
- movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
- movq [edi+72], mm0 /* write 43 53 63 73 */
- punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
- punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
- movq [edi+88], mm2 /* write 44 54 64 74 */
- punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
- punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
- movq [edi+104], mm3 /* write 45 55 65 75 */
-
- movq [edi+120], mm4 /* write 46 56 66 76 */
- movq [edi+136], mm5 /* write 47 57 67 77 */
- /* we use xmm0,xmm1,xmm2 for rows 1-4 and xmm4,xmm5,xmm6 for rows 5-8 */
- /* xmm3 = {128, 128, 128, 128, 128, 128, 128, 128} */
-
- pcmpeqw xmm3, xmm3 /* xmm3 = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF */
- psllw xmm3, 15 /* xmm3 = 80008000800080008000800080008000 */
- psrlw xmm3, 8 /* xmm3 = 00800080008000800080008000800080 */
-
- movdqa xmm2, [edi+16] /* Pixel 1 */
- movdqa xmm6, [edi+80] /* Pixel 5 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- movdqa xmm0, xmm2 /* xmm0 = pixel 1 */
- movdqa xmm4, xmm6 /* xmm4 = pixel 5 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel1 * pixel1 */
- pmullw xmm6, xmm6 /* xmm6 = pixel5 * pixel5 */
-
- movdqa xmm1, xmm2 /* xmm1 = pixel1^2 */
- movdqa xmm5, xmm6 /* xmm5 = pixel5^2 */
-
- movdqa xmm2, [edi+32] /* Pixel 2 */
- movdqa xmm6, [edi+96] /* Pixel 6 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 2 */
- paddw xmm4, xmm6 /* xmm4 += pixel 6 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel2^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel6^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel2^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel6^2 */
-
- movdqa xmm2, [edi+48] /* Pixel 3 */
- movdqa xmm6, [edi+112] /* Pixel 7 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 3 */
- paddw xmm4, xmm6 /* xmm4 += pixel 7 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel3^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel7^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel3^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel7^2 */
-
- movdqa xmm2, [edi+64] /* Pixel 4 */
- movdqa xmm6, [edi+128] /* Pixel 8 */
-
- psubw xmm2, xmm3 /* xmm2 -=128 */
- psubw xmm6, xmm3 /* xmm6 -=128 */
-
- paddw xmm0, xmm2 /* xmm0 += pixel 4 */
- paddw xmm4, xmm6 /* xmm4 += pixel 8 */
-
- pmullw xmm2, xmm2 /* xmm2 = pixel4^2 */
- pmullw xmm6, xmm6 /* xmm6 = pixel8^2 */
-
- paddw xmm1, xmm2 /* xmm1 += pixel4^2 */
- paddw xmm5, xmm6 /* xmm5 += pixel8^2 */
-
- /* xmm0 = x1 + x2 + x3 + x4 */
- /* xmm1 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* xmm4 = x5 + x6 + x7 + x8 */
- /* xmm5 = x5^2 + x6^2 + x7^2 + x8^2 */
-
- movdqa xmm7, xmm3 /* xmm7 = xmm3 */
- psrlw xmm7, 7 /* xmm7 = 00010001000100010001000100010001 */
-
- movdqa xmm2, xmm0 /* make copy of sum1 */
- movdqa xmm6, xmm4 /* make copy of sum2 */
-
- paddw xmm0, xmm7 /* (sum1 + 1) */
- paddw xmm4, xmm7 /* (sum2 + 1) */
-
- psraw xmm2, 1 /* sum1 /2 */
- psraw xmm6, 1 /* sum2 /2 */
-
- psraw xmm0, 1 /* (sum1 + 1)/2 */
- psraw xmm4, 1 /* (sum2 + 1)/2 */
-
- pmullw xmm2, xmm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw xmm6, xmm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw xmm1, xmm2 /* Variance 1 */
- psubw xmm5, xmm6 /* Variance 2 */
-
- movdqa xmm7, FLimitWMT /* xmm7 = FLimit */
- movdqa xmm2, xmm1 /* copy of Variance 1 */
- movdqa [Variance1], xmm1 /* save the Variance1 */
- movdqa [Variance2], xmm5 /* save the Variance2 */
- movdqa xmm6, xmm5 /* Variance 2 */
- psubw xmm1, xmm7 /* Variance 1 < Flimit? */
-
- psubw xmm5, xmm7 /* Variance 2 < Flimit? */
- psraw xmm2, 15 /* Variance 1 > 32768? */
- psraw xmm6, 15 /* Variance 2 > 32768? */
- psraw xmm1, 15 /* FFFF/0000 for true/false */
-
- psraw xmm5, 15 /* FFFF/0000 for true/false */
- movdqa xmm7, [edi+64] /* xmm7 = Pixel 4 */
- pandn xmm2, xmm1 /* Variance1<32768 &&
-                      Variance1<Limit */
- pandn xmm6, xmm5 /* Variance2<32768 &&
-                      Variance2<Limit */
-
- movdqa xmm4, [edi+80] /* xmm4 = Pixel 5 */
- pand xmm6, xmm2 /* xmm6 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movdqa xmm2, xmm7 /* make copy of Pixel4 */
- psubusw xmm7, xmm4 /* 4 - 5 */
- psubusw xmm4, xmm2 /* 5 - 4 */
-
- por xmm7, xmm4 /* abs(4 - 5) */
- psubw xmm7, QStepWMT /* abs(4-5)<QStep ? */
-
- psraw xmm7, 15 /* FFFF/0000 for True/False */
- pand xmm7, xmm6
-
- /* xmm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* xmm7 is now in use (it holds the flag) */
- /* find the loop filtered values for the pixels on block boundary */
- movdqa xmm1, LoopFLimitWMT /* Get the Flimit values for loop filter */
- movdqa xmm3, [edi + 48] /* xmm3 = x3 = p[-2] */
- movdqa xmm4, [edi + 64] /* xmm4 = x4 = p[-1] */
- movdqa xmm5, [edi + 80] /* xmm5 = x5 = p[ 0] */
- movdqa xmm6, [edi + 96] /* xmm6 = x6 = p[ 1] */
- psubw xmm5, xmm4 /* xmm5 = p[ 0] - p[-1] */
- psubw xmm3, xmm6 /* xmm3 = p[-2] - p[ 1] */
- movdqa xmm4, xmm5 /* make a copy */
- paddw xmm4, xmm5 /* 2 * ( p[0] - p[-1] ) */
- paddw xmm3, EightFours /* xmm3 + 4 */
- paddw xmm5, xmm4 /* 3 * ( p[0] - p[-1] ) */
- paddw xmm3, xmm5 /* FiltVal before shift */
- psraw xmm3, 3 /* FiltVal */
- movdqa xmm2, xmm3 /* make a copy */
- psraw xmm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor xmm2, xmm3
- psubsw xmm2, xmm3 /* xmm2 = abs(FiltVal) */
- por xmm3, EightOnes /* -1 and 1 for + and - */
- movdqa xmm4, xmm1 /* make a copy of Flimit */
- psubw xmm1, xmm2 /* xmm1 = Flimit - abs(FiltVal) */
- movdqa xmm5, xmm1 /* copy Flimit - abs(FiltVal) */
- psraw xmm1, 15 /* FFFF or 0000 */
- pxor xmm5, xmm1
- psubsw xmm5, xmm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw xmm4, xmm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw xmm4, xmm3 /* get the sign back */
- movdqa xmm1, [edi+64] /* p[-1] */
- movdqa xmm2, [edi+80] /* p[0] */
-
- paddw xmm1, xmm4 /* p[-1] + NewFiltVal */
- psubw xmm2, xmm4 /* p[0] - NewFiltVal */
- pxor xmm6, xmm6 /* clear xmm6 */
- packuswb xmm1, xmm1 /* clamping */
- packuswb xmm2, xmm2
- punpcklbw xmm1, xmm6 /* unpack to word */
-
- movdqa LoopFilteredValuesUp, xmm1 /* save the values */
- punpcklbw xmm2, xmm6 /* unpack to word */
- movdqa LoopFilteredValuesDown, xmm2 /* save the values */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movdqa xmm5, [edi] /* xmm5 = -5 */
- movdqa xmm4, [edi + 16] /* xmm4 = -4 */
-
- movdqa xmm3, xmm4 /* copy of -4 */
- movdqa xmm6, xmm5 /* copy of -5 */
-
- psubusw xmm4, xmm6 /* xmm4 = [-4] - [-5] */
- psubusw xmm5, xmm3 /* xmm5 = [-5] - [-4] */
-
- por xmm4, xmm5 /* abs([-4]-[-5] ) */
- psubw xmm4, QStepWMT /* abs([-4]-[-5] )<QStep? */
-
- psraw xmm4, 15 /* FFFF/0000 for True/False */
- movdqa xmm1, xmm4 /* copy of the xmm4 */
-
- pand xmm4, xmm6 /* keep [-5] where condition true */
- pandn xmm1, xmm3 /* keep [-4] where condition false */
-
- por xmm1, xmm4 /* xmm1 = p1 */
-
- /* now find P2 */
-
- movdqa xmm4, [edi+128] /* xmm4 = [3] */
- movdqa xmm5, [edi+144] /* xmm5 = [4] */
-
- movdqa xmm3, xmm4 /* copy of 3 */
- movdqa xmm6, xmm5 /* copy of 4 */
-
- psubusw xmm4, xmm6 /* xmm4 = [3] - [4] */
- psubusw xmm5, xmm3 /* xmm5 = [4] - [3] */
-
- por xmm4, xmm5 /* abs([3]-[4] ) */
- psubw xmm4, QStepWMT /* abs([3]-[4] )<QStep? */
-
- psraw xmm4, 15 /* FFFF/0000 for True/False */
- movdqa xmm2, xmm4 /* copy of the xmm4 */
-
- pand xmm4, xmm6 /* keep [4] where condition true */
- pandn xmm2, xmm3 /* keep [3] where condition false */
-
- por xmm2, xmm4 /* xmm2 = p2 */
- /* Data is ready, now do the filtering */
-
- pxor xmm0, xmm0 /* clear xmm0 */
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
- /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
- /* otherwise Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
-
- movdqa xmm3, xmm1 /* xmm3 = p1 */
- paddw xmm3, xmm3 /* xmm3 = p1 + p1 */
-
- paddw xmm3, xmm1 /* xmm3 = p1 + p1 + p1 */
- movdqa xmm4, [edi+16] /* xmm4 = x1 */
-
- paddw xmm3, [edi+32] /* xmm3 = p1+p1+p1+ x2 */
- paddw xmm4, [edi+48] /* xmm4 = x1+x3 */
-
- paddw xmm3, [edi+64] /* xmm3 += x4 */
- paddw xmm4, EightFours /* xmm4 = x1 + x3 + 4 */
-
- paddw xmm3, xmm4 /* xmm3 = 3*p1+x1+x2+x3+x4+4 */
- movdqa xmm4, xmm3 /* xmm4 = xmm3 */
-
- movdqa xmm5, [edi+16] /* xmm5 = x1 */
- paddw xmm4, xmm5 /* xmm4 = sum+x1 */
-
- psllw xmm4, 1 /* xmm4 = (sum+x1)<<1 */
- psubw xmm4, [edi+64] /* xmm4 = (sum+x1)<<1-x4 */
-
- paddw xmm4, [edi+80] /* xmm4 = (sum+x1)<<1-x4+x5 */
- psraw xmm4, 4 /* xmm4 >>=4 */
-
- psubw xmm4, xmm5 /* New Value - old Value */
- pand xmm4, xmm7 /* And the flag */
-
- paddw xmm4, xmm5 /* add the old value back */
- packuswb xmm4, xmm0 /* pack it to bytes */
-
- movdq2q mm0, xmm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movdqa xmm5, [edi+32] /* xmm5= x2 */
- psubw xmm3, xmm1 /* sum=sum-p1 */
-
- paddw xmm3, [edi+80] /* sum=sum+x5 */
- movdqa xmm4, xmm5 /* copy sum */
-
- paddw xmm4, xmm3 /* xmm4=sum+x2 */
- paddw xmm4, xmm4 /* xmm4 <<= 1 */
-
- psubw xmm4, [edi+80] /* xmm4 =(sum+x2)<<1-x5 */
- paddw xmm4, [edi+96] /* xmm4 =(sum+x2)<<1-x5+x6 */
-
- psraw xmm4, 4 /* xmm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw xmm4, xmm5 /* new value - old value */
-
- pand xmm4, xmm7 /* And the flag */
- paddw xmm4, xmm5 /* add the old value back */
- packuswb xmm4, xmm0 /* pack it to bytes */
- movdq2q mm1, xmm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movdqa xmm5, [edi+48] /* xmm5= x3 */
- psubw xmm3, xmm1 /* sum=sum-p1 */
-
- paddw xmm3, [edi+96] /* sum=sum+x6 */
- movdqa xmm4, xmm5 /* copy x3 */
-
- paddw xmm4, xmm3 /* xmm4=sum+x3 */
- paddw xmm4, xmm4 /* xmm4 <<= 1 */
-
- psubw xmm4, [edi+96] /* xmm4 =(sum+x3)<<1-x6 */
- paddw xmm4, [edi+112] /* xmm4 =(sum+x3)<<1-x6+x7 */
-
- psraw xmm4, 4 /* xmm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw xmm4, xmm5 /* new value - old value */
-
- pand xmm4, xmm7 /* And the flag */
- paddw xmm4, xmm5 /* add the old value back */
-
- packuswb xmm4, xmm0 /* pack it to bytes */
- movdq2q mm2, xmm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8)>>4 */
-
- movdqa xmm5, [edi+64] /* xmm5 = x4 */
- psubw xmm3, xmm1 /* sum = sum-p1 */
-
- paddw xmm3, [edi+112] /* sum = sum+x7 */
- movdqa xmm4, xmm5 /* xmm4 = x4 */
-
- paddw xmm4, xmm3 /* xmm4 = sum + x4 */
- paddw xmm4, xmm4 /* xmm4 *=2 */
-
- paddw xmm4, xmm1 /* += p1 */
- psubw xmm4, [edi+16] /* -= x1 */
-
- psubw xmm4, [edi+112] /* -= x7 */
- paddw xmm4, [edi+128] /* += x8 */
-
- movdqa xmm5, LoopFilteredValuesUp /* Read the loop filtered value of x4 */
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x4 */
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x4 */
- packuswb xmm4, xmm0 /* pack it to bytes */
- movdq2q mm3, xmm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movdqa xmm5, [edi+80] /* xmm5 = x5 */
- psubw xmm3, [edi+16] /* sum -= x1 */
-
- paddw xmm3, [edi+128] /* sum += x8 */
- movdqa xmm4, xmm5 /* xmm4 = x5 */
-
- paddw xmm4, xmm3 /* xmm4= sum+x5 */
- paddw xmm4, xmm4 /* xmm4 *= 2 */
-
- paddw xmm4, [edi+16] /* += x1 */
- psubw xmm4, [edi+32] /* -= x2 */
-
- psubw xmm4, [edi+128] /* -= x8 */
- paddw xmm4, xmm2 /* += p2 */
- movdqa xmm5, LoopFilteredValuesDown /* Read the loop filtered value of x5 */
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x5 */
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x5 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movdq2q mm4, xmm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movdqa xmm5, [edi+96] /* xmm5 = x6 */
- psubw xmm3, [edi+32] /* -= x2 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x6 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x6 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+32] /* +=x2 */
- psubw xmm4, [edi+48] /* -=x3 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x6 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x6 */
-
- packuswb xmm4, xmm0 /* pack to bytes */
- movdq2q mm5, xmm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movdqa xmm5, [edi+112] /* xmm5 = x7 */
- psubw xmm3, [edi+48] /* -= x3 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x7 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x7 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+48] /* +=x3 */
- psubw xmm4, [edi+64] /* -=x4 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x7 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x7 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movdq2q mm6, xmm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movdqa xmm5, [edi+128] /* xmm5 = x8 */
- psubw xmm3, [edi+64] /* -= x4 */
-
- paddw xmm3, xmm2 /* += p2 */
- movdqa xmm4, xmm5 /* xmm4 = x8 */
-
- paddw xmm4, xmm3 /* xmm4 = sum+x8 */
- paddw xmm4, xmm4 /* xmm4 *= 2*/
-
- paddw xmm4, [edi+64] /* +=x4 */
- psubw xmm4, [edi+80] /* -=x5 */
-
- psraw xmm4, 4 /* >>=4 */
- psubw xmm4, xmm5 /* -=x8 */
-
- pand xmm4, xmm7 /* and flag */
- paddw xmm4, xmm5 /* += x8 */
- packuswb xmm4, xmm0 /* pack to bytes */
- movdq2q mm7, xmm4 /* write new x8 */
- /* transpose */
- movq2dq xmm0, mm0 /* xmm0 = 70 60 50 40 30 20 10 00 */
- movq2dq xmm1, mm1 /* xmm1 = 71 61 51 41 31 21 11 01 */
- movq2dq xmm2, mm2 /* xmm2 = 72 62 52 42 32 22 12 02 */
- movq2dq xmm3, mm3 /* xmm3 = 73 63 53 43 33 23 13 03 */
- punpcklbw xmm0, xmm1 /* xmm0 = 7170 6160 5150 4140 3130 2120 1110 0100 */
- punpcklbw xmm2, xmm3 /* xmm2 = 7372 6362 5352 4342 3332 2322 1312 0302 */
- movdqa xmm1, xmm0 /* xmm1 = 7170 6160 5150 4140 3130 2120 1110 0100 */
- punpcklwd xmm0, xmm2 /* xmm0 = 33323130 23222120 13121110 03020100 */
- punpckhwd xmm1, xmm2 /* xmm1 = 73727170 63626160 53525150 43424140 */
-
- movq2dq xmm4, mm4 /* xmm4 = 74 64 54 44 34 24 14 04 */
- movq2dq xmm5, mm5 /* xmm5 = 75 65 55 45 35 25 15 05 */
- movq2dq xmm6, mm6 /* xmm6 = 76 66 56 46 36 26 16 06 */
- movq2dq xmm7, mm7 /* xmm7 = 77 67 57 47 37 27 17 07 */
-
- punpcklbw xmm4, xmm5 /* xmm4 = 7574 6564 5554 4544 3534 2524 1514 0504 */
- punpcklbw xmm6, xmm7 /* xmm6 = 7776 6766 5756 4746 3736 2726 1716 0706 */
- movdqa xmm5, xmm4 /* xmm5 = 7574 6564 5554 4544 3534 2524 1514 0504 */
- punpcklwd xmm4, xmm6 /* xmm4 = 37363534 27262524 17161514 07060504 */
- punpckhwd xmm5, xmm6 /* xmm5 = 77767574 67666564 57565554 47464544 */
- movdqa xmm2, xmm0 /* xmm2 = 33323130 23222120 13121110 03020100 */
- punpckldq xmm0, xmm4 /* xmm0 = 1716151413121110 0706050403020100 */
- movq QWORD PTR [esi+edx*2],xmm0 /* write 00 01 02 03 04 05 06 07 */
- psrldq xmm0, 8 /* xmm0 = 1716151413121110 */
- punpckhdq xmm2, xmm4 /* xmm2 = 3736353433323130 2726252423222120 */
- movq QWORD PTR [esi+edx], xmm0 /* write 10 11 12 13 14 15 16 17 */
- movdqa xmm3, xmm1 /* xmm3 = 73727170 63626160 53525150 43424140 */
-
- punpckldq xmm1, xmm5 /* xmm1 = 5756555453525150 4746454443424140 */
- movq QWORD PTR [esi], xmm2 /* write 20 21 22 23 24 25 26 27 */
-
- psrldq xmm2, 8 /* xmm2 = 3736353433323130 */
- punpckhdq xmm3, xmm5 /* xmm3 = 7776757473727170 6766656463626160 */
- movq QWORD PTR [esi+ecx], xmm2 /* write 30 31 32 33 34 35 36 37 */
- lea esi, [esi+ecx*4] /* esi = Des - 4 + 6 * Pitch */
-
- movq QWORD PTR [esi+edx*2], xmm1 /* write 40 41 42 43 44 45 46 47 */
- movq QWORD PTR [esi], xmm3 /* write 60 61 62 63 64 65 66 67 */
- psrldq xmm1, 8 /* xmm1 = 5756555453525150 */
- psrldq xmm3, 8 /* xmm3 = 7776757473727170 */
- movq QWORD PTR [esi+edx], xmm1 /* write 50 51 52 53 54 55 56 57 */
- movq QWORD PTR [esi+ecx], xmm3 /* write 70 71 72 73 74 75 76 77 */
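-
- /* Sketch of the write-back just performed: mm0..mm7 hold the eight new
-    column values (x1..x8) for all 8 rows, and the interleaves above undo
-    the input transpose so that, per row r and column c,
-       Des[ -4 + c + r*PlaneLineStep ] = NewColumn[c][r];
-    (NewColumn is an illustrative name, not a variable in the source). */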
- pop edi
- pop esi
- pop edx
- pop ecx
- pop eax
- }// end of __asm
- Var1=Variance1[0]+Variance1[1]+Variance1[2]+Variance1[3]+Variance1[4]+Variance1[5]+Variance1[6]+Variance1[7];
- Var2=Variance2[0]+Variance2[1]+Variance2[2]+Variance2[3]+Variance2[4]+Variance2[5]+Variance2[6]+Variance2[7];
- pbi->FragmentVariances[CurrentFrag-1] += Var1;
- pbi->FragmentVariances[CurrentFrag] += Var2;
- CurrentFrag ++;
- }//else
-
- }//while
- #endif
- }
- /****************************************************************************
- *
- * ROUTINE : PlaneAddNoise_wmt
- *
- * INPUTS : UINT8 *Start starting address of buffer to add gaussian
- * noise to
- * UINT32 Width width of plane
- * UINT32 Height height of plane
- * INT32 Pitch distance between subsequent lines of frame
- * INT32 q quantizer used to determine amount of noise
- * to add
- *
- * OUTPUTS : None.
- *
- * RETURNS : void.
- *
- * FUNCTION : adds gaussian noise to a plane of pixels
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
- void PlaneAddNoise_wmt( UINT8 *Start, UINT32 Width, UINT32 Height, INT32 Pitch, int q)
- {
- unsigned int i;
- INT32 Pitch4 = Pitch * 4;
- const int noiseAmount = 2;
- const int noiseAdder = 2 * noiseAmount + 1;
- #if defined(_WIN32_WCE)
- return;
- #else
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
- char CharDist[300];
- char Rand[2048];
- double sigma;
- // return;
- __asm emms
- sigma = 1 + .8*(63-q) / 63.0;
- // set up a lookup table of 256 entries that matches
- // a gaussian distribution with sigma determined by q.
- //
- {
- double i,sum=0;
- int next,j;
- next=0;
- for(i=-32;i<32;i++)
- {
- int a = (int)(.5+256*gaussian(sigma,0,i));
- if(a)
- {
- for(j=0;j<a;j++)
- {
- CharDist[next+j]=(char) i;
- }
- next = next+j;
- }
- }
- for( ; next<256; next++)
- CharDist[next] = 0;
- }
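- /* gaussian() is assumed here to be the usual normal-density helper, e.g.
-    (sketch only, not necessarily the exact definition used elsewhere):
-       double gaussian(double sigma, double mu, double x)
-       {
-           return 1.0 / (sigma * sqrt(2.0 * 3.14159265)) *
-                  exp( -(x - mu) * (x - mu) / (2.0 * sigma * sigma) );
-       }
-    so CharDist ends up with roughly 256 entries distributed like N(0, sigma). */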
- for(i=0;i<2048;i++)
- {
- Rand[i]=CharDist[rand() & 0xff];
- }
- for(i=0;i<16;i++)
- {
- blackclamp[i]=-CharDist[0];
- whiteclamp[i]=-CharDist[0];
- bothclamp[i]=-2*CharDist[0];
- }
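- /* Scalar sketch of the per-pixel work done in the loop below: with
-    n = -CharDist[0] (the largest noise magnitude),
-       p = (p < n) ? n : p;                  clamp the low end
-       p = (p > 255 - n) ? 255 - n : p;      clamp the high end
-       p = p + noise;                        cannot wrap once p is clamped
-    which is what the psubusb/paddusb sequence implements with saturation. */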
- for(i=0;i<Height;i++)
- {
- UINT8 *Pos = Start + i *Pitch;
- INT8 *Ref = Rand + (rand() & 0xff);
- __asm
- {
- mov ecx, [Width]
- mov esi,Pos
- mov edi,Ref
- xor eax,eax
- nextset:
- movdqu xmm1,[esi+eax] // get the source
- psubusb xmm1,blackclamp // clamp both sides so we don't outrange adding noise
- paddusb xmm1,bothclamp
- psubusb xmm1,whiteclamp
- movdqu xmm2,[edi+eax] // get the noise for this line
- paddb xmm1,xmm2 // add it in
- movdqu [esi+eax],xmm1 // store the result
- add eax,16 // advance to the next 16 pixels
- cmp eax, ecx
- jl nextset
- }
- }
- #endif
- }
|