/****************************************************************************
*
* Module Title : DeblockOpt.c
*
* Description : Optimized functions for deblocking
*
* AUTHOR : Yaowu Xu
*
*****************************************************************************
* Revision History
*
* 1.04 YWX 21-Mar-02 Bug fixed in functions using abs diff criteria
* 1.03 YWX 15-Jun-01 Added new 7-tap filter in deblocking
* 1.02 YWX 02-May-01 Changed to use sum of abs diff to replace variance
* 1.01 YWX 17-Nov-00 Re-arranged loop inside deblockNonFilteredBand()
* 1.00 YWX 02-Nov-00 Configuration baseline from old PPoptfunctions.c
*
*****************************************************************************
*/
/****************************************************************************
* Header Files
*****************************************************************************
*/
#ifdef _MSC_VER
#pragma warning(disable:4799)
#pragma warning(disable:4731)
#endif
#define STRICT /* Strict type checking. */
#include "postp.h"
#include <stdio.h>
#include <stdlib.h>
/****************************************************************************
* Module constants.
*****************************************************************************
*/
#if defined(_WIN32_WCE)
#pragma pack(16)
static short Eight128s[] = {128, 128, 128, 128, 128, 128, 128, 128};
static short Eight64s[] = {64, 64, 64, 64, 64, 64, 64, 64};
static short EightThrees[] = {3, 3, 3, 3, 3, 3, 3, 3};
static short EightFours[] = {4, 4, 4, 4, 4, 4, 4, 4};
static short Four128s[] = {128, 128, 128, 128};
static short Four64s[] = {64, 64, 64, 64};
static short FourThrees[] = {3, 3, 3, 3};
static short FourFours[] = {4, 4, 4, 4};
static short FourOnes[] = {1, 1, 1, 1};
static unsigned char Eight128c[] = {128, 128, 128, 128, 128, 128, 128, 128};
#pragma pack()
#else
__declspec(align(16)) static short Eight128s[] = {128, 128, 128, 128, 128, 128, 128, 128};
__declspec(align(16)) static short Eight64s[] = {64, 64, 64, 64, 64, 64, 64, 64};
__declspec(align(16)) static short EightThrees[] = {3, 3, 3, 3, 3, 3, 3, 3};
__declspec(align(16)) static short EightFours[] = {4, 4, 4, 4, 4, 4, 4, 4};
__declspec(align(16)) static short Four128s[] = {128, 128, 128, 128};
__declspec(align(16)) static short Four64s[] = {64, 64, 64, 64};
__declspec(align(16)) static short FourThrees[] = {3, 3, 3, 3};
__declspec(align(16)) static short FourFours[] = {4, 4, 4, 4};
__declspec(align(16)) static short FourOnes[] = {1, 1, 1, 1};
__declspec(align(16)) static unsigned char Eight128c[] = {128, 128, 128, 128, 128, 128, 128, 128};
#endif
/****************************************************************************
* Explicit Imports
*****************************************************************************
*/
extern UINT32 *DeblockLimitValuesV2;
/****************************************************************************
* Exported Global Variables
*****************************************************************************
*/
/****************************************************************************
* Exported Functions
*****************************************************************************
*/
extern double gaussian(double sigma, double mu, double x);
/****************************************************************************
* Module Statics
*****************************************************************************
*/
/****************************************************************************
*
* ROUTINE : SetupDeblockValueArray_ForMMX
*
* INPUTS : POSTPROC_INSTANCE *pbi, INT32 FLimit
*
* OUTPUTS : None
*
* RETURNS : Pointer to the aligned deblock value array
*
* FUNCTION : Sets up the aligned array of limit values used by the MMX
* deblocking code.
*
* SPECIAL NOTES :
*
*
* ERRORS : None.
*
****************************************************************************/
INT32 *SetupDeblockValueArray_ForMMX(POSTPROC_INSTANCE *pbi, INT32 FLimit)
{
INT32 * BoundingValuePtr;
/*
Since the FiltBoundingValue array is currently only used in the generic version, we are going
to reuse this memory for our own purposes:
2 longs for the limit, 2 longs for _4ONES, 2 longs for LFABS_MMX, and 8 longs for temp work storage.
*/
BoundingValuePtr = (INT32 *)((UINT32)(&pbi->DeblockBoundingValue[256]) & 0xffffffe0);
/* expand for MMX code */
BoundingValuePtr[0] = BoundingValuePtr[1] = FLimit * 0x00010001;
BoundingValuePtr[2] = BoundingValuePtr[3] = 0x00010001;
BoundingValuePtr[4] = BoundingValuePtr[5] = 0x00040004;
return BoundingValuePtr;
}
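/* Illustrative note (not part of the original code): each INT32 pair above
   holds one value replicated as 16-bit halfwords, so a single MMX load
   broadcasts it across four packed words, e.g.:
       movq mm0, [BoundingValuePtr]        ; {FLimit, FLimit, FLimit, FLimit}
       movq mm1, [BoundingValuePtr + 8]    ; {1, 1, 1, 1}
       movq mm2, [BoundingValuePtr + 16]   ; {4, 4, 4, 4}
*/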
/****************************************************************************
*
* ROUTINE : DeblockLoopFilteredBand_MMX
*
* INPUTS : None
*
* OUTPUTS : None
*
* RETURNS : None
*
* FUNCTION : Filters both horizontal and vertical edges in a band
*
* SPECIAL NOTES :
*
* REFERENCE :
*
* ERRORS : None.
*
****************************************************************************/
void DeblockLoopFilteredBand_MMX(
POSTPROC_INSTANCE *pbi,
UINT8 *SrcPtr,
UINT8 *DesPtr,
UINT32 PlaneLineStep,
UINT32 FragAcross,
UINT32 StartFrag,
UINT32 *QuantScale
)
{
UINT32 j;
UINT32 CurrentFrag=StartFrag;
UINT32 QStep;
UINT8 *Src, *Des;
UINT32 Var1, Var2;
#if defined(_WIN32_WCE)
#pragma pack(16)
short QStepMmx[4];
short FLimitMmx[4];
short Rows[80];
short NewRows[64];
unsigned short Variance11[4];
unsigned short Variance12[4];
unsigned short Variance21[4];
unsigned short Variance22[4];
#pragma pack()
#else
__declspec(align(16)) short QStepMmx[4];
__declspec(align(16)) short FLimitMmx[4];
__declspec(align(16)) short Rows[80];
__declspec(align(16)) short NewRows[64];
__declspec(align(16)) unsigned short Variance11[4];
__declspec(align(16)) unsigned short Variance12[4];
__declspec(align(16)) unsigned short Variance21[4];
__declspec(align(16)) unsigned short Variance22[4];
#endif
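/* Note on the local buffers (descriptive): Rows[] holds the ten source rows
   around the horizontal edge (rows -5..+4) expanded to 16-bit words, eight
   pixels per row; NewRows[] receives the eight filtered rows before they
   are packed back to bytes and stored. */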
Src=SrcPtr;
Des=DesPtr;
while(CurrentFrag < StartFrag + FragAcross )
{
QStep = QuantScale[ pbi->FragQIndex[CurrentFrag+FragAcross]];
if( QStep > 3 )
{
QStepMmx[0] = (INT16)QStep;
QStepMmx[1] = (INT16)QStep;
QStepMmx[2] = (INT16)QStep;
QStepMmx[3] = (INT16)QStep;
__asm
{
/* Save the registers */
push eax
push ebp
push ecx
push edx
push esi
push edi
/* Calculate the FLimit and store FLimit and QStep */
movq mm0, QStepMmx /* mm0 = QStep */
movq mm1, FourThrees /* mm1 = {3,3,3,3} */
pmullw mm1, mm0 /* mm1 = QStep * 3 */
pmullw mm1, mm0 /* mm1 = QStep * QStep * 3 */
psrlw mm1, 5 /* mm1 = FLimit */
movq [FLimitMmx], mm1 /* Save FLimit */
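/* In C terms (illustrative): FLimit = (3 * QStep * QStep) >> 5;
   e.g. QStep = 8 gives FLimit = (3 * 8 * 8) >> 5 = 6. */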
/* Copy the data to the intermediate buffer */
mov eax, Src /* eax = Src */
xor edx, edx /* clear edx */
lea esi, NewRows /* esi = NewRows */
lea edi, Rows /* edi = Rows */
mov ecx, PlaneLineStep /* ecx = Pitch */
pxor mm7, mm7 /* Clear mm7 */
sub edx, ecx /* edx = -Pitch */
lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
movq mm0, [eax + edx] /* mm0 = Src[-5*Pitch] */
movq mm1, mm0 /* mm1 = mm0 */
punpcklbw mm0, mm7 /* Lower Four -5 */
movq mm2, [eax] /* mm2 = Src[-4*Pitch] */
movq mm3, mm2 /* mm3 = mm2 */
punpckhbw mm1, mm7 /* Higher Four -5 */
movq [edi], mm0 /* Write Lower Four of -5 */
punpcklbw mm2, mm7 /* Lower Four -4 */
punpckhbw mm3, mm7 /* Higher Four -4 */
movq [edi+8], mm1 /* Write Higher Four of -5 */
movq mm4, [eax + ecx] /* mm4 = Src[-3*Pitch] */
movq [edi+16], mm2 /* Write Lower -4 */
movq [edi+24], mm3 /* Write Higher -4 */
movq mm5, mm4 /* mm5 = mm4 */
punpcklbw mm4, mm7 /* lower four -3 */
movq mm0, [eax + ecx *2] /* mm0 = Src[-2*Pitch] */
punpckhbw mm5, mm7 /* higher four -3 */
movq mm1, mm0 /* mm1 = mm0 */
movq [edi+32], mm4 /* write Lower -3 */
punpcklbw mm0, mm7 /* lower four -2 */
lea eax, [eax + ecx *4] /* eax = Src */
movq [edi+40], mm5 /* write Higher -3 */
punpckhbw mm1, mm7 /* higher four -2 */
movq mm2, [eax + edx] /* mm2 = Src[-Pitch] */
movq [edi+48], mm0 /* lower -2 */
movq mm3, mm2 /* mm3 = mm2 */
punpcklbw mm2, mm7 /* lower -1 */
movq [edi+56], mm1 /* higher -2 */
punpckhbw mm3, mm7 /* Higher -1 */
movq mm4, [eax] /* mm4 = Src[0] */
movq [edi+64], mm2 /* Lower -1 */
movq mm5, mm4 /* mm5 = mm4 */
movq [edi+72], mm3 /* Higher -1 */
punpcklbw mm4, mm7 /* lower 0 */
punpckhbw mm5, mm7 /* higher 0 */
movq mm0, [eax + ecx] /* mm0 = Src[Pitch] */
movq [edi+80], mm4 /* write lower 0 */
movq mm1, mm0 /* mm1 = mm0 */
movq [edi+88], mm5 /* write higher 0 */
punpcklbw mm0, mm7 /* lower 1 */
punpckhbw mm1, mm7 /* higher 1 */
movq mm2, [eax + ecx *2 ] /* mm2 = Src[2*Pitch] */
lea eax, [eax + ecx *4] /* eax = Src + 4 * Pitch */
movq mm3, mm2 /* mm3 = mm2 */
movq [edi+96], mm0 /* write lower 1 */
punpcklbw mm2, mm7 /* lower 2 */
punpckhbw mm3, mm7 /* higher 2 */
movq mm4, [eax + edx ] /* mm4 = Src[3*Pitch] */
movq [edi+104], mm1 /* write higher 1 */
movq mm5, mm4 /* mm5 = mm4 */
punpcklbw mm4, mm7 /* Low 3 */
movq [edi+112], mm2 /* write lower 2 */
movq [edi+120], mm3 /* write higher 2 */
movq mm0, [eax] /* mm0 = Src[4*Pitch] */
punpckhbw mm5, mm7 /* high 3 */
movq mm1, mm0 /* mm1 = mm0 */
movq [edi+128], mm4 /* low 3 */
punpcklbw mm0, mm7 /* low 4 */
punpckhbw mm1, mm7 /* high 4 */
movq [edi+136], mm5 /* high 3 */
movq [edi+144], mm0 /* low 4 */
movq [edi+152], mm1 /* high 4 */
/* done with copying everything to intermediate buffer */
/* Now, compute the variances for Pixel 1-4 and 5-8 */
/* we use mm0,mm1,mm2 for 1234 and mm4, mm5, mm6 for 5-8 */
/* mm7 = 0, mm3 = {128, 128, 128, 128} */
pcmpeqw mm3, mm3 /* mm3 = FFFFFFFFFFFFFFFF */
psllw mm3, 15 /* mm3 = 8000800080008000 */
psrlw mm3, 8 /* mm3 = 0080008000800080 */
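/* Descriptive note: subtracting 128 below centers the pixel values so the
   16-bit sums and products stay small for typical content; the sign-bit
   (psraw 15) checks further down screen out values that still overflow. */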
movq mm2, [edi+16] /* Pixel 1 */
movq mm6, [edi+80] /* Pixel 5 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
movq mm0, mm2 /* mm0 = pixel 1 */
movq mm4, mm6 /* mm4 = pixel 5 */
pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
movq mm1, mm2 /* mm1 = pixel1^2 */
movq mm5, mm6 /* mm5 = pixel5^2 */
movq mm2, [edi+32] /* Pixel 2 */
movq mm6, [edi+96] /* Pixel 6 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 2 */
paddw mm4, mm6 /* mm4 += pixel 6 */
pmullw mm2, mm2 /* mm2 = pixel2^2 */
pmullw mm6, mm6 /* mm6 = pixel6^2 */
paddw mm1, mm2 /* mm1 += pixel2^2 */
paddw mm5, mm6 /* mm5 += pixel6^2 */
movq mm2, [edi+48] /* Pixel 3 */
movq mm6, [edi+112] /* Pixel 7 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 3 */
paddw mm4, mm6 /* mm4 += pixel 7 */
pmullw mm2, mm2 /* mm2 = pixel3^2 */
pmullw mm6, mm6 /* mm6 = pixel7^2 */
paddw mm1, mm2 /* mm1 += pixel3^2 */
paddw mm5, mm6 /* mm5 += pixel7^2 */
movq mm2, [edi+64] /* Pixel 4 */
movq mm6, [edi+128] /* Pixel 8 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 4 */
paddw mm4, mm6 /* mm4 += pixel 8 */
pmullw mm2, mm2 /* mm2 = pixel4^2 */
pmullw mm6, mm6 /* mm6 = pixel8^2 */
paddw mm1, mm2 /* mm1 += pixel4^2 */
paddw mm5, mm6 /* mm5 += pixel8^2 */
/* mm0 = x1 + x2 + x3 + x4 */
/* mm1 = x1^2 + x2^2 + x3^2 + x4^2 */
/* mm4 = x5 + x6 + x7 + x8 */
/* mm5 = x5^2 + x6^2 + x7^2 + x8^2 */
movq mm7, mm3 /* mm7 = mm3 */
psrlw mm7, 7 /* mm7 = 0001000100010001 */
movq mm2, mm0 /* make copy of sum1 */
movq mm6, mm4 /* make copy of sum2 */
paddw mm0, mm7 /* (sum1 + 1) */
paddw mm4, mm7 /* (sum2 + 1) */
psraw mm2, 1 /* sum1 /2 */
psraw mm6, 1 /* sum2 /2 */
psraw mm0, 1 /* (sum1 + 1)/2 */
psraw mm4, 1 /* (sum2 + 1)/2 */
pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
psubw mm1, mm2 /* Variance 1 */
psubw mm5, mm6 /* Variance 2 */
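/* Descriptive note: (S >> 1) * ((S + 1) >> 1) == S*S/4 rounded down, so
   each word now holds sum(x^2) - sum(x)^2/4, i.e. four times the variance
   of the four samples in that column. */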
movq mm7, FLimitMmx /* mm7 = FLimit */
movq mm2, mm1 /* copy of Variance 1 */
movq mm6, mm5 /* copy of Variance 2 */
movq [Variance11], mm1 /* Save Variance1 */
movq [Variance21], mm5 /* Save Variance2 */
psubw mm1, mm7 /* Variance 1 < FLimit? */
psubw mm5, mm7 /* Variance 2 < FLimit? */
psraw mm2, 15 /* Variance 1 > 32768? */
psraw mm6, 15 /* Variance 2 > 32768? */
psraw mm1, 15 /* FFFF/0000 for true/false */
psraw mm5, 15 /* FFFF/0000 for true/false */
movq mm7, [edi+64] /* mm7 = Pixel 4 */
pandn mm2, mm1 /* Variance1<32768 &&
Variance1<FLimit */
pandn mm6, mm5 /* Variance2<32768 &&
Variance2<FLimit */
movq mm4, [edi+80] /* mm4 = Pixel 5 */
pand mm6, mm2 /* mm6 = Variance1 < FLimit */
/* && Variance2 < FLimit */
movq mm2, mm7 /* make copy of Pixel4 */
psubusw mm7, mm4 /* 4 - 5 */
psubusw mm4, mm2 /* 5 - 4 */
por mm7, mm4 /* abs(4 - 5) */
psubw mm7, QStepMmx /* abs(4-5)<QStepMmx ? */
psraw mm7, 15 /* FFFF/0000 for True/False */
pand mm7, mm6
/* mm7 = Variance 1< FLimit && Variance 2<FLimit && abs(4-5)<QStep */
/* now let's look at the right four columns */
add edi, 8 /* offset 8 to right 4 cols */
movq mm2, [edi+16] /* Pixel 1 */
movq mm6, [edi+80] /* Pixel 5 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
movq mm0, mm2 /* mm0 = pixel 1 */
movq mm4, mm6 /* mm4 = pixel 5 */
pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
movq mm1, mm2 /* mm1 = pixel1^2 */
movq mm5, mm6 /* mm5 = pixel5^2 */
movq mm2, [edi+32] /* Pixel 2 */
movq mm6, [edi+96] /* Pixel 6 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 2 */
paddw mm4, mm6 /* mm4 += pixel 6 */
pmullw mm2, mm2 /* mm2 = pixel2^2 */
pmullw mm6, mm6 /* mm6 = pixel6^2 */
paddw mm1, mm2 /* mm1 += pixel2^2 */
paddw mm5, mm6 /* mm5 += pixel6^2 */
movq mm2, [edi+48] /* Pixel 3 */
movq mm6, [edi+112] /* Pixel 7 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 3 */
paddw mm4, mm6 /* mm4 += pixel 7 */
pmullw mm2, mm2 /* mm2 = pixel3^2 */
pmullw mm6, mm6 /* mm6 = pixel7^2 */
paddw mm1, mm2 /* mm1 += pixel3^2 */
paddw mm5, mm6 /* mm5 += pixel7^2 */
movq mm2, [edi+64] /* Pixel 4 */
movq mm6, [edi+128] /* Pixel 8 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 4 */
paddw mm4, mm6 /* mm4 += pixel 8 */
pmullw mm2, mm2 /* mm2 = pixel4^2 */
pmullw mm6, mm6 /* mm6 = pixel8^2 */
paddw mm1, mm2 /* mm1 += pixel4^2 */
paddw mm5, mm6 /* mm5 += pixel8^2 */
/* mm0 = x1 + x2 + x3 + x4 */
/* mm1 = x1^2 + x2^2 + x3^2 + x4^2 */
/* mm4 = x5 + x6 + x7 + x8 */
/* mm5 = x5^2 + x6^2 + x7^2 + x8^2 */
psrlw mm3, 7 /* mm3 = 0001000100010001 */
movq mm2, mm0 /* make copy of sum1 */
movq mm6, mm4 /* make copy of sum2 */
paddw mm0, mm3 /* (sum1 + 1) */
paddw mm4, mm3 /* (sum2 + 1) */
psraw mm2, 1 /* sum1 /2 */
psraw mm6, 1 /* sum2 /2 */
psraw mm0, 1 /* (sum1 + 1)/2 */
psraw mm4, 1 /* (sum2 + 1)/2 */
pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
psubw mm1, mm2 /* Variance 1 */
psubw mm5, mm6 /* Variance 2 */
movq [Variance12], mm1 /* Save Variance1 */
movq [Variance22], mm5 /* Save Variance2 */
movq mm3, FLimitMmx /* mm3 = FLimit */
movq mm2, mm1 /* copy of Variance 1 */
movq mm6, mm5 /* copy of Variance 2 */
psubw mm1, mm3 /* Variance 1 < FLimit? */
psubw mm5, mm3 /* Variance 2 < FLimit? */
psraw mm2, 15 /* Variance 1 > 32768? */
psraw mm6, 15 /* Variance 2 > 32768? */
psraw mm1, 15 /* FFFF/0000 for true/false */
psraw mm5, 15 /* FFFF/0000 for true/false */
movq mm0, [edi+64] /* mm0 = Pixel 4 */
pandn mm2, mm1 /* Variance1<32768 &&
Variance1<FLimit */
pandn mm6, mm5 /* Variance2<32768 &&
Variance2<FLimit */
movq mm4, [edi+80] /* mm4 = Pixel 5 */
pand mm6, mm2 /* mm6 = Variance1 < FLimit */
/* && Variance2 < FLimit */
movq mm2, mm0 /* make copy of Pixel4 */
psubusw mm0, mm4 /* 4 - 5 */
psubusw mm4, mm2 /* 5 - 4 */
por mm0, mm4 /* abs(4 - 5) */
psubw mm0, QStepMmx /* abs(4-5)<QStepMmx ? */
psraw mm0, 15 /* FFFF/0000 for True/False */
pand mm0, mm6
sub edi, 8 /* offset edi back */
/* mm0 = Variance 1< FLimit && Variance 2<FLimit && abs(4-5)<QStep */
/* mm0 and mm7 are now in use */
/* Let's do the filtering now */
/* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
/* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
movq mm5, [edi] /* mm5 = -5 */
movq mm4, [edi + 16] /* mm4 = -4 */
movq mm3, mm4 /* copy of -4 */
movq mm6, mm5 /* copy of -5 */
psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
por mm4, mm5 /* abs([-4]-[-5]) */
psubw mm4, QStepMmx /* abs([-4]-[-5])<QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm1, mm4 /* copy of the mm4 */
pand mm4, mm6 /* select [-5] where true */
pandn mm1, mm3 /* select [-4] where false */
por mm1, mm4 /* mm1 = p1 */
/* now find P2 */
movq mm4, [edi+128] /* mm4 = [3] */
movq mm5, [edi+144] /* mm5 = [4] */
movq mm3, mm4 /* copy of 3 */
movq mm6, mm5 /* copy of 4 */
psubusw mm4, mm6 /* mm4 = [3] - [4] */
psubusw mm5, mm3 /* mm5 = [4] - [3] */
por mm4, mm5 /* abs([3]-[4]) */
psubw mm4, QStepMmx /* abs([3]-[4])<QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm2, mm4 /* copy of the mm4 */
pand mm4, mm6 /* select [4] where true */
pandn mm2, mm3 /* select [3] where false */
por mm2, mm4 /* mm2 = p2 */
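/* Descriptive sketch of the eight stores below: starting from
   sum = 3*p1 + x1 + x2 + x3 + x4 + 4, each output row is computed as
   roughly ((sum + xi) << 1 plus edge corrections) >> 4, then sum slides
   by dropping one sample on the left and adding one on the right, with
   p1 and p2 padding the two ends of the window. */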
/* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
/* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
/* otherwise Des[-w4] = Src[-w4]; */
/* which is equivalent to Src[-w4] + flag * (newvalue - Src[-w4]) */
movq mm3, mm1 /* mm3 = p1 */
paddw mm3, mm3 /* mm3 = p1 + p1 */
paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
movq mm4, [edi+16] /* mm4 = x1 */
paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
paddw mm4, [edi+48] /* mm4 = x1+x3 */
paddw mm3, [edi+64] /* mm3 += x4 */
paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
movq mm4, mm3 /* mm4 = mm3 */
movq mm5, [edi+16] /* mm5 = x1 */
paddw mm4, mm5 /* mm4 = sum+x1 */
psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
psraw mm4, 4 /* mm4 >>= 4 */
psubw mm4, mm5 /* New Value - old Value */
pand mm4, mm7 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi], mm4 /* Write new x1 */
/* sum += x5 - p1 */
/* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
movq mm5, [edi+32] /* mm5 = x2 */
psubw mm3, mm1 /* sum = sum-p1 */
paddw mm3, [edi+80] /* sum = sum+x5 */
movq mm4, mm5 /* copy x2 */
paddw mm4, mm3 /* mm4 = sum+x2 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+80] /* mm4 = (sum+x2)<<1-x5 */
paddw mm4, [edi+96] /* mm4 = (sum+x2)<<1-x5+x6 */
psraw mm4, 4 /* mm4 = ((sum+x2)<<1-x5+x6)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm7 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+16], mm4 /* write new x2 */
/* sum += x6 - p1 */
/* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
movq mm5, [edi+48] /* mm5 = x3 */
psubw mm3, mm1 /* sum = sum-p1 */
paddw mm3, [edi+96] /* sum = sum+x6 */
movq mm4, mm5 /* copy x3 */
paddw mm4, mm3 /* mm4 = sum+x3 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+96] /* mm4 = (sum+x3)<<1-x6 */
paddw mm4, [edi+112] /* mm4 = (sum+x3)<<1-x6+x7 */
psraw mm4, 4 /* mm4 = ((sum+x3)<<1-x6+x7)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm7 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+32], mm4 /* write new x3 */
/* sum += x7 - p1 */
/* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8)>>4 */
movq mm5, [edi+64] /* mm5 = x4 */
psubw mm3, mm1 /* sum = sum-p1 */
paddw mm3, [edi+112] /* sum = sum+x7 */
movq mm4, mm5 /* mm4 = x4 */
paddw mm4, mm3 /* mm4 = sum + x4 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, mm1 /* += p1 */
psubw mm4, [edi+16] /* -= x1 */
psubw mm4, [edi+112] /* -= x7 */
paddw mm4, [edi+128] /* += x8 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x4 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x4 */
movq [esi+48], mm4 /* write new x4 */
/* sum += x8 - x1 */
/* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
movq mm5, [edi+80] /* mm5 = x5 */
psubw mm3, [edi+16] /* sum -= x1 */
paddw mm3, [edi+128] /* sum += x8 */
movq mm4, mm5 /* mm4 = x5 */
paddw mm4, mm3 /* mm4 = sum+x5 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+16] /* += x1 */
psubw mm4, [edi+32] /* -= x2 */
psubw mm4, [edi+128] /* -= x8 */
paddw mm4, mm2 /* += p2 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x5 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x5 */
movq [esi+64], mm4 /* write new x5 */
/* sum += p2 - x2 */
/* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
movq mm5, [edi+96] /* mm5 = x6 */
psubw mm3, [edi+32] /* -= x2 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x6 */
paddw mm4, mm3 /* mm4 = sum+x6 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+32] /* += x2 */
psubw mm4, [edi+48] /* -= x3 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x6 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x6 */
movq [esi+80], mm4 /* write new x6 */
/* sum += p2 - x3 */
/* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
movq mm5, [edi+112] /* mm5 = x7 */
psubw mm3, [edi+48] /* -= x3 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x7 */
paddw mm4, mm3 /* mm4 = sum+x7 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+48] /* += x3 */
psubw mm4, [edi+64] /* -= x4 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x7 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x7 */
movq [esi+96], mm4 /* write new x7 */
/* sum += p2 - x4 */
/* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
movq mm5, [edi+128] /* mm5 = x8 */
psubw mm3, [edi+64] /* -= x4 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x8 */
paddw mm4, mm3 /* mm4 = sum+x8 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+64] /* += x4 */
psubw mm4, [edi+80] /* -= x5 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x8 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x8 */
movq [esi+112], mm4 /* write new x8 */
/* done with left four columns */
/* now do the right four columns */
add edi, 8 /* shift to right four columns */
add esi, 8 /* shift to right four columns */
/* mm0 = Variance 1< FLimit && Variance 2<FLimit && abs(4-5)<QStep */
/* mm0 is now in use */
/* Let's do the filtering now */
/* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
/* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
movq mm5, [edi] /* mm5 = -5 */
movq mm4, [edi + 16] /* mm4 = -4 */
movq mm3, mm4 /* copy of -4 */
movq mm6, mm5 /* copy of -5 */
psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
por mm4, mm5 /* abs([-4]-[-5]) */
psubw mm4, QStepMmx /* abs([-4]-[-5])<QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm1, mm4 /* copy of the mm4 */
pand mm4, mm6 /* select [-5] where true */
pandn mm1, mm3 /* select [-4] where false */
por mm1, mm4 /* mm1 = p1 */
/* now find P2 */
movq mm4, [edi+128] /* mm4 = [3] */
movq mm5, [edi+144] /* mm5 = [4] */
movq mm3, mm4 /* copy of 3 */
movq mm6, mm5 /* copy of 4 */
psubusw mm4, mm6 /* mm4 = [3] - [4] */
psubusw mm5, mm3 /* mm5 = [4] - [3] */
por mm4, mm5 /* abs([3]-[4]) */
psubw mm4, QStepMmx /* abs([3]-[4])<QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm2, mm4 /* copy of the mm4 */
pand mm4, mm6 /* select [4] where true */
pandn mm2, mm3 /* select [3] where false */
por mm2, mm4 /* mm2 = p2 */
/* psum = p1 + p1 + p1 + v[1] + v[2] + v[3] + v[4] + 4; */
/* Des[-w4] = (((psum + v[1]) << 1) - (v[4] - v[5])) >> 4; */
/* otherwise Des[-w4] = Src[-w4]; */
/* which is equivalent to Src[-w4] + flag * (newvalue - Src[-w4]) */
movq mm3, mm1 /* mm3 = p1 */
paddw mm3, mm3 /* mm3 = p1 + p1 */
paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
movq mm4, [edi+16] /* mm4 = x1 */
paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
paddw mm4, [edi+48] /* mm4 = x1+x3 */
paddw mm3, [edi+64] /* mm3 += x4 */
paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
movq mm4, mm3 /* mm4 = mm3 */
movq mm5, [edi+16] /* mm5 = x1 */
paddw mm4, mm5 /* mm4 = sum+x1 */
psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
psraw mm4, 4 /* mm4 >>= 4 */
psubw mm4, mm5 /* New Value - old Value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi], mm4 /* Write new x1 */
/* sum += x5 - p1 */
/* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
movq mm5, [edi+32] /* mm5 = x2 */
psubw mm3, mm1 /* sum = sum-p1 */
paddw mm3, [edi+80] /* sum = sum+x5 */
movq mm4, mm5 /* copy x2 */
paddw mm4, mm3 /* mm4 = sum+x2 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+80] /* mm4 = (sum+x2)<<1-x5 */
paddw mm4, [edi+96] /* mm4 = (sum+x2)<<1-x5+x6 */
psraw mm4, 4 /* mm4 = ((sum+x2)<<1-x5+x6)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+16], mm4 /* write new x2 */
/* sum += x6 - p1 */
/* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
movq mm5, [edi+48] /* mm5 = x3 */
psubw mm3, mm1 /* sum = sum-p1 */
paddw mm3, [edi+96] /* sum = sum+x6 */
movq mm4, mm5 /* copy x3 */
paddw mm4, mm3 /* mm4 = sum+x3 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+96] /* mm4 = (sum+x3)<<1-x6 */
paddw mm4, [edi+112] /* mm4 = (sum+x3)<<1-x6+x7 */
psraw mm4, 4 /* mm4 = ((sum+x3)<<1-x6+x7)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+32], mm4 /* write new x3 */
/* sum += x7 - p1 */
/* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8)>>4 */
movq mm5, [edi+64] /* mm5 = x4 */
psubw mm3, mm1 /* sum = sum-p1 */
paddw mm3, [edi+112] /* sum = sum+x7 */
movq mm4, mm5 /* mm4 = x4 */
paddw mm4, mm3 /* mm4 = sum + x4 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, mm1 /* += p1 */
psubw mm4, [edi+16] /* -= x1 */
psubw mm4, [edi+112] /* -= x7 */
paddw mm4, [edi+128] /* += x8 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x4 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x4 */
movq [esi+48], mm4 /* write new x4 */
/* sum += x8 - x1 */
/* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
movq mm5, [edi+80] /* mm5 = x5 */
psubw mm3, [edi+16] /* sum -= x1 */
paddw mm3, [edi+128] /* sum += x8 */
  720. movq mm4, mm5 /* mm4 = x5 */
  721. paddw mm4, mm3 /* mm4= sum+x5 */
  722. paddw mm4, mm4 /* mm4 *= 2 */
  723. paddw mm4, [edi+16] /* += x1 */
  724. psubw mm4, [edi+32] /* -= x2 */
  725. psubw mm4, [edi+128] /* -= x8 */
  726. paddw mm4, mm2 /* += p2 */
  727. psraw mm4, 4 /* >>=4 */
  728. psubw mm4, mm5 /* -=x5 */
  729. pand mm4, mm0 /* and flag */
  730. paddw mm4, mm5 /* += x5 */
  731. movq [esi+64], mm4 /* write new x5 */
  732. /* sum += p2 - x2 */
  733. /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
  734. movq mm5, [edi+96] /* mm5 = x6 */
  735. psubw mm3, [edi+32] /* -= x2 */
  736. paddw mm3, mm2 /* += p2 */
  737. movq mm4, mm5 /* mm4 = x6 */
  738. paddw mm4, mm3 /* mm4 = sum+x6 */
  739. paddw mm4, mm4 /* mm4 *= 2*/
  740. paddw mm4, [edi+32] /* +=x2 */
  741. psubw mm4, [edi+48] /* -=x3 */
  742. psraw mm4, 4 /* >>=4 */
  743. psubw mm4, mm5 /* -=x6 */
  744. pand mm4, mm0 /* and flag */
  745. paddw mm4, mm5 /* += x6 */
  746. movq [esi+80], mm4 /* write new x6 */
  747. /* sum += p2 - x3 */
  748. /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
  749. movq mm5, [edi+112] /* mm5 = x7 */
  750. psubw mm3, [edi+48] /* -= x3 */
  751. paddw mm3, mm2 /* += p2 */
  752. movq mm4, mm5 /* mm4 = x7 */
  753. paddw mm4, mm3 /* mm4 = sum+x7 */
  754. paddw mm4, mm4 /* mm4 *= 2*/
  755. paddw mm4, [edi+48] /* +=x3 */
  756. psubw mm4, [edi+64] /* -=x4 */
  757. psraw mm4, 4 /* >>=4 */
  758. psubw mm4, mm5 /* -=x7 */
  759. pand mm4, mm0 /* and flag */
  760. paddw mm4, mm5 /* += x7 */
  761. movq [esi+96], mm4 /* write new x7 */
  762. /* sum += p2 - x4 */
  763. /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
  764. movq mm5, [edi+128] /* mm5 = x8 */
  765. psubw mm3, [edi+64] /* -= x4 */
  766. paddw mm3, mm2 /* += p2 */
  767. movq mm4, mm5 /* mm4 = x8 */
  768. paddw mm4, mm3 /* mm4 = sum+x8 */
  769. paddw mm4, mm4 /* mm4 *= 2*/
  770. paddw mm4, [edi+64] /* +=x4 */
  771. psubw mm4, [edi+80] /* -=x5 */
  772. psraw mm4, 4 /* >>=4 */
  773. psubw mm4, mm5 /* -=x8 */
  774. pand mm4, mm0 /* and flag */
  775. paddw mm4, mm5 /* += x8 */
  776. movq [esi+112], mm4 /* write new x8 */
  777. /* done with the right four columns */
  778. add edi, 8 /* shift edi to point x1 */
  779. sub esi, 8 /* shift esi back to x1 */
  780. mov ebp, Des /* the destination */
  781. lea ebp, [ebp + edx *4] /* point to des[-w4] */
  782. movq mm0, [esi]
  783. packuswb mm0, [esi + 8]
  784. movq [ebp], mm0 /* write des[-w4] */
  785. movq mm1, [esi + 16]
  786. packuswb mm1, [esi + 24]
  787. movq [ebp+ecx ], mm1 /* write des[-w3] */
  788. movq mm2, [esi + 32]
  789. packuswb mm2, [esi + 40]
  790. movq [ebp+ecx*2 ], mm2 /* write des[-w2] */
  791. movq mm3, [esi + 48]
  792. packuswb mm3, [esi + 56]
  793. lea ebp, [ebp+ecx*4] /* point to des[0] */
  794. movq [ebp+edx], mm3 /* write des[-w1] */
  795. movq mm0, [esi + 64]
  796. packuswb mm0, [esi + 72]
  797. movq [ebp ], mm0 /* write des[0] */
  798. movq mm1, [esi + 80]
  799. packuswb mm1, [esi + 88]
  800. movq [ebp+ecx], mm1 /* write des[w1] */
  801. movq mm2, [esi + 96]
  802. packuswb mm2, [esi + 104]
  803. movq [ebp+ecx*2], mm2 /* write des[w2] */
  804. movq mm3, [esi + 112]
  805. packuswb mm3, [esi + 120]
  806. lea ebp, [ebp+ecx*2] /* point to des[w2] */
  807. movq [ebp+ecx], mm3 /* write des[w3] */
  808. pop edi
  809. pop esi
  810. pop edx
  811. pop ecx
  812. pop ebp
  813. pop eax
  814. } /* end of the macro */
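  /* For reference, a scalar C sketch of the nine-tap smoother applied above
     (names are illustrative, not from this file). x[1..8] are the eight
     pixels straddling the edge, p1/p2 the conditionally-substituted outer
     pixels, and flag mirrors the mm0 mask: when false, the original pixels
     are kept.

     static void DeblockKernelSketch(const short x[9], short p1, short p2,
                                     int flag, short out[9])
     {
         int i, sum = 3 * p1 + x[1] + x[2] + x[3] + x[4] + 4;
         short v[9];
         v[1] = (short)((((sum + x[1]) << 1) - x[4] + x[5]) >> 4);
         sum += x[5] - p1;
         v[2] = (short)((((sum + x[2]) << 1) - x[5] + x[6]) >> 4);
         sum += x[6] - p1;
         v[3] = (short)((((sum + x[3]) << 1) - x[6] + x[7]) >> 4);
         sum += x[7] - p1;
         v[4] = (short)((((sum + x[4]) << 1) + p1 - x[1] - x[7] + x[8]) >> 4);
         sum += x[8] - x[1];
         v[5] = (short)((((sum + x[5]) << 1) + x[1] - x[2] - x[8] + p2) >> 4);
         sum += p2 - x[2];
         v[6] = (short)((((sum + x[6]) << 1) + x[2] - x[3]) >> 4);
         sum += p2 - x[3];
         v[7] = (short)((((sum + x[7]) << 1) + x[3] - x[4]) >> 4);
         sum += p2 - x[4];
         v[8] = (short)((((sum + x[8]) << 1) + x[4] - x[5]) >> 4);
         for (i = 1; i <= 8; i++)
             out[i] = flag ? v[i] : x[i];
     }

     The rolling sum makes each output row cost a constant number of adds,
     which is why the assembly updates one running total rather than
     re-summing the window for every row. */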
  815. Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
  816. Var1 += Variance12[0]+ Variance12[1]+Variance12[2]+Variance12[3];
  817. pbi->FragmentVariances[CurrentFrag] += Var1;
  818. Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
  819. Var2 += Variance22[0]+ Variance22[1]+Variance22[2]+Variance22[3];
  820. pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
  821. }
  822. else
  823. {
  824. /* copy from src to des */
  825. __asm
  826. {
  827. push esi
  828. push edi
  829. push ecx
  830. mov esi, Src /* esi = Src */
  831. mov edi, Des /* edi = Des */
  832. push edx
  833. mov ecx, PlaneLineStep /* ecx = Pitch */
  834. xor edx, edx /* clear edx */
  835. sub edx, ecx /* edx = -Pitch */
  836. lea esi, [esi+edx*4] /* esi=Src-4*Pitch*/
  837. movq mm0, [esi] /* first row */
  838. movq [edi+edx*4], mm0 /* write first row */
  839. lea edi, [edi+edx*4] /* edi=Des-4*Pitch*/
  840. movq mm1, [esi+ecx] /* Src-3*Pitch */
  841. movq [edi+ecx], mm1 /* write second row */
  842. movq mm2, [esi+ecx*2] /* Src-2*Pitch */
  843. lea esi, [esi+ecx*4] /* Src */
  844. movq [edi+ecx*2], mm2 /* write third row */
  845. lea edi, [edi+ecx*4] /* Des */
  846. movq mm3, [esi+edx] /* Src-Pitch */
  847. movq [edi+edx], mm3 /* write fourth row */
  848. movq mm4, [esi] /* Src */
  849. movq mm5, [esi+ecx] /* Src+Pitch */
  850. movq [edi], mm4 /* write fifth row */
  851. movq mm6, [esi+ecx*2]
  852. lea esi, [esi+ecx*4] /* Src+pitch*4 */
  853. movq [edi+ecx], mm5 /* write the sixth row */
  854. movq [edi+ecx*2], mm6 /* write the seventh row */
  855. movq mm7, [esi+edx]
  856. lea edi, [edi+ecx*4] /* Des+Pitch*4 */
  857. movq [edi+edx], mm7 /* write the last row */
  858. pop edx
  859. pop ecx
  860. pop edi
  861. pop esi
  862. }
  863. }
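  /* The copy branch above is eight 8-byte row moves through the MMX
     registers. An equivalent C sketch (assuming <string.h> and that Src/Des
     point at the fifth of the eight rows, as in the assembly):

     int i;
     for (i = -4; i < 4; i++)
         memcpy(Des + i * (int)PlaneLineStep, Src + i * (int)PlaneLineStep, 8);
  */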
  864. Src += 8;
  865. Des += 8;
  866. CurrentFrag ++;
  867. }
  868. Des -= ((PlaneLineStep + FragAcross)<<3);
  869. Des += 8;
  870. Src = Des;
  871. CurrentFrag = StartFrag ;
  872. while(CurrentFrag < StartFrag + FragAcross - 1)
  873. {
  874. QStep = QuantScale[pbi->FragQIndex[CurrentFrag+1]];
  875. if( QStep > 3 )
  876. {
  877. QStepMmx[0] = (INT16)QStep;
  878. QStepMmx[1] = (INT16)QStep;
  879. QStepMmx[2] = (INT16)QStep;
  880. QStepMmx[3] = (INT16)QStep;
  881. for( j=0; j<8;j++)
  882. {
  883. Rows[j] = (short) (Src[-5 +j*PlaneLineStep]);
  884. Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
  885. }
  886. __asm
  887. {
  888. /* Save the registers */
  889. push eax
  890. push ebp
  891. push ecx
  892. push edx
  893. push esi
  894. push edi
  895. /* Calculate the FLimit and store FLimit and QStep */
  896. movq mm0, QStepMmx /* mm0 = QStep */
  897. movq mm1, FourThrees /* mm1 = 03030303 */
  898. pmullw mm1, mm0 /* mm1 = QStep * 3 */
  899. pmullw mm1, mm0 /* mm1 = QStep * QStep * 3 */
  900. psrlw mm1, 5 /* mm1 = FLimit */
  901. movq [FLimitMmx], mm1 /* Save FLimit */
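  /* In scalar terms the instructions above compute, per 16-bit lane,
     FLimit = (3 * QStep * QStep) >> 5; the psrlw by 5 is the divide by 32.
     A one-line sketch:

     unsigned short FLimitScalar = (unsigned short)((3 * QStep * QStep) >> 5);
  */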
  902. /* setup the pointers to data */
  903. mov eax, Src /* eax = Src */
  904. xor edx, edx /* clear edx */
  905. sub eax, 4 /* eax = Src-4 */
  906. lea esi, NewRows /* esi = NewRows */
  907. lea edi, Rows /* edi = Rows */
  908. mov ecx, PlaneLineStep /* ecx = Pitch */
  909. sub edx, ecx /* edx = -Pitch */
  910. /* Get the data to the intermediate buffer */
  911. movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
  912. movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
  913. movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
  914. lea eax, [eax+ecx*4] /* Go down four Rows */
  915. movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
  916. movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
  917. punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
  918. punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
  919. movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
  920. punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
  921. punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
  922. movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
  923. punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
  924. punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
  925. movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
  926. punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
  927. punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
  928. pxor mm7, mm7 /* clear mm7 */
  929. movq mm5, mm0 /* make a copy */
  930. punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
  931. movq [edi+16], mm0 /* write 00 10 20 30 */
  932. punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
  933. movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
  934. movq [edi+32], mm5 /* write 01 11 21 31 */
  935. punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
  936. punpckhbw mm0, mm7 /* mm0 = 33 23 13 03 */
  937. movq [edi+48], mm1 /* write 02 12 22 32 */
  938. movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
  939. movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
  940. movq [edi+64], mm0 /* write 03 13 23 33 */
  941. punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
  942. punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
  943. movq [edi+80], mm2 /* write 04 14 24 34 */
  944. punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
  945. punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
  946. movq [edi+96], mm3 /* write 05 15 25 35 */
  947. movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
  948. movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
  949. movq [edi+112], mm4 /* write 06 16 26 36 */
  950. movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
  951. lea eax, [eax+ ecx*4] /* Go down four rows */
  952. movq [edi+128], mm5 /* write 07 17 27 37 */
  953. movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
  954. movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
  955. punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
  956. punpckhbw mm4, mm1 /* mm4 = 57 47 56 46 55 45 54 44 */
  957. movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
  958. punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
  959. punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
  960. movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
  961. punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
  962. punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
  963. movq mm2, mm4 /* mm2 = 57 47 56 46 55 45 54 44 */
  964. punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
  965. punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
  966. movq mm5, mm0 /* make a copy */
  967. punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
  968. movq [edi+24], mm0 /* write 40 50 60 70 */
  969. punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
  970. movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
  971. movq [edi+40], mm5 /* write 41 51 61 71 */
  972. punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
  973. punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
  974. movq [edi+56], mm1 /* write 42 52 62 72 */
  975. movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
  976. movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
  977. movq [edi+72], mm0 /* write 43 53 63 73 */
  978. punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
  979. punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
  980. movq [edi+88], mm2 /* write 44 54 64 74 */
  981. punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
  982. punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
  983. movq [edi+104], mm3 /* write 45 55 65 75 */
  984. movq [edi+120], mm4 /* write 46 56 66 76 */
  985. movq [edi+136], mm5 /* write 47 57 67 77 */
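  /* The punpckl/punpckh ladder above is a conventional 8x8 byte transpose
     followed by zero-extension to 16-bit words. A scalar sketch of what
     lands in the buffer (illustrative indexing: Rows[8..71] receives the
     transposed block column-major, one 8-word group per source column,
     covering the edge columns Src[-4]..Src[3]):

     int r, c;
     for (r = 0; r < 8; r++)
         for (c = 0; c < 8; c++)
             Rows[8 + c * 8 + r] = (short)Src[r * (int)PlaneLineStep + c - 4];
  */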
  986. /* Now, compute the variances for Pixel 1-4 and 5-8 */
  987. /* we use mm0,mm1,mm2 for 1234 and mm4, mm5, mm6 for 5-8 */
  988. /* mm7 = 0, mm3 = {128, 128, 128, 128} */
  989. pcmpeqw mm3, mm3 /* mm3 = FFFFFFFFFFFFFFFF */
  990. psllw mm3, 15 /* mm3 = 8000800080008000 */
  991. psrlw mm3, 8 /* mm3 = 0080008000800080 */
  992. movq mm2, [edi+16] /* Pixel 1 */
  993. movq mm6, [edi+80] /* Pixel 5 */
  994. psubw mm2, mm3 /* mm2 -=128 */
  995. psubw mm6, mm3 /* mm6 -=128 */
  996. movq mm0, mm2 /* mm0 = pixel 1 */
  997. movq mm4, mm6 /* mm4 = pixel 5 */
  998. pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
  999. pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
  1000. movq mm1, mm2 /* mm1 = pixel1^2 */
  1001. movq mm5, mm6 /* mm5 = pixel5^2 */
  1002. movq mm2, [edi+32] /* Pixel 2 */
  1003. movq mm6, [edi+96] /* Pixel 6 */
  1004. psubw mm2, mm3 /* mm2 -=128 */
  1005. psubw mm6, mm3 /* mm6 -=128 */
  1006. paddw mm0, mm2 /* mm0 += pixel 2 */
  1007. paddw mm4, mm6 /* mm4 += pixel 6 */
  1008. pmullw mm2, mm2 /* mm2 = pixel2^2 */
  1009. pmullw mm6, mm6 /* mm6 = pixel6^2 */
  1010. paddw mm1, mm2 /* mm1 += pixel2^2 */
  1011. paddw mm5, mm6 /* mm5 += pixel6^2 */
  1012. movq mm2, [edi+48] /* Pixel 3 */
  1013. movq mm6, [edi+112] /* Pixel 7 */
  1014. psubw mm2, mm3 /* mm2 -=128 */
  1015. psubw mm6, mm3 /* mm6 -=128 */
  1016. paddw mm0, mm2 /* mm0 += pixel 3 */
  1017. paddw mm4, mm6 /* mm4 += pixel 7 */
  1018. pmullw mm2, mm2 /* mm2 = pixel3^2 */
  1019. pmullw mm6, mm6 /* mm6 = pixel7^2 */
  1020. paddw mm1, mm2 /* mm1 += pixel3^2 */
  1021. paddw mm5, mm6 /* mm5 += pixel7^2 */
  1022. movq mm2, [edi+64] /* Pixel 4 */
  1023. movq mm6, [edi+128] /* Pixel 8 */
  1024. psubw mm2, mm3 /* mm2 -=128 */
  1025. psubw mm6, mm3 /* mm6 -=128 */
  1026. paddw mm0, mm2 /* mm0 += pixel 4 */
  1027. paddw mm4, mm6 /* mm4 += pixel 8 */
  1028. pmullw mm2, mm2 /* mm2 = pixel4^2 */
  1029. pmullw mm6, mm6 /* mm6 = pixel8^2 */
  1030. paddw mm1, mm2 /* mm1 += pixel4^2 */
  1031. paddw mm5, mm6 /* mm5 += pixel8^2 */
  1032. /* mm0 = x1 + x2 + x3 + x4 */
  1033. /* mm1 = x1^2 + x2^2 + x3^2 + x4^2 */
  1034. /* mm4 = x5 + x6 + x7 + x8 */
  1035. /* mm5 = x5^2 + x6^2 + x7^2 + x8^2 */
  1036. movq mm7, mm3 /* mm7 = mm3 */
  1037. psrlw mm7, 7 /* mm7 = 0001000100010001 */
  1038. movq mm2, mm0 /* make copy of sum1 */
  1039. movq mm6, mm4 /* make copy of sum2 */
  1040. paddw mm0, mm7 /* (sum1 + 1) */
  1041. paddw mm4, mm7 /* (sum2 + 1) */
  1042. psraw mm2, 1 /* sum1 /2 */
  1043. psraw mm6, 1 /* sum2 /2 */
  1044. psraw mm0, 1 /* (sum1 + 1)/2 */
  1045. psraw mm4, 1 /* (sum2 + 1)/2 */
  1046. pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
  1047. pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
  1048. psubw mm1, mm2 /* Variance 1 */
  1049. psubw mm5, mm6 /* Variance 2 */
  1050. movq [Variance11], mm1 /* Save Variance1 */
  1051. movq [Variance21], mm5 /* Save Variance2 */
  1052. movq mm7, FLimitMmx /* mm7 = FLimit */
  1053. movq mm2, mm1 /* copy of Variance 1*/
  1054. movq mm6, mm5 /* copy of Variance 2*/
  1055. psubw mm1, mm7 /* Variance 1 < Flimit? */
  1056. psubw mm5, mm7 /* Variance 2 < Flimit? */
  1057. psraw mm1, 15 /* FFFF/0000 for true/false */
  1058. psraw mm5, 15 /* FFFF/0000 for true/false */
  1059. psraw mm2, 15 /* Variance 1 > 32768 ? */
  1060. psraw mm6, 15 /* Variance 2 > 32768 ? */
  1061. movq mm7, [edi+64] /* mm7 = Pixel 4 */
  1062. pandn mm2, mm1 /* Variance 1 < Flimit &&
  1063. Variance 1 < 32768 */
  1064. pandn mm6, mm5 /* Variance 2 < Flimit &&
  1065. Variance 2 < 32768 */
  1066. movq mm4, [edi+80] /* mm4 = Pixel 5 */
  1067. pand mm6, mm2 /* mm6 = Variance1 < Flimit */
  1068. /* &&Variance2 < Flimit */
  1069. movq mm2, mm7 /* make copy of Pixel4 */
  1070. psubusw mm7, mm4 /* 4 - 5 */
  1071. psubusw mm4, mm2 /* 5 - 4 */
  1072. por mm7, mm4 /* abs(4 - 5) */
  1073. psubw mm7, QStepMmx /* abs(4-5)<QStepMmx ? */
  1074. psraw mm7, 15 /* FFFF/0000 for True/False */
  1075. pand mm7, mm6
  1076. /* mm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
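  /* In scalar form the mask built above is (a sketch; Var1/Var2, FLimit and
     QStep are the per-lane values, x4/x5 the pixels on each side of the
     edge):

     int flag = Var1 < FLimit && Var1 < 32768 &&
                Var2 < FLimit && Var2 < 32768 &&
                abs(x4 - x5) < QStep;

     The four-pixel variance itself was formed as
     SumSq - (Sum / 2) * ((Sum + 1) / 2), an integer approximation of
     SumSq - Sum*Sum/4 that keeps every intermediate product within 16
     bits. */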
  1077. /* now let's look at the right four columns */
  1078. add edi, 8 /* offset 8 to right 4 cols */
  1079. movq mm2, [edi+16] /* Pixel 1 */
  1080. movq mm6, [edi+80] /* Pixel 5 */
  1081. psubw mm2, mm3 /* mm2 -=128 */
  1082. psubw mm6, mm3 /* mm6 -=128 */
  1083. movq mm0, mm2 /* mm0 = pixel 1 */
  1084. movq mm4, mm6 /* mm4 = pixel 5 */
  1085. pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
  1086. pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
  1087. movq mm1, mm2 /* mm1 = pixel1^2 */
  1088. movq mm5, mm6 /* mm5 = pixel5^2 */
  1089. movq mm2, [edi+32] /* Pixel 2 */
  1090. movq mm6, [edi+96] /* Pixel 6 */
  1091. psubw mm2, mm3 /* mm2 -=128 */
  1092. psubw mm6, mm3 /* mm6 -=128 */
  1093. paddw mm0, mm2 /* mm0 += pixel 2 */
  1094. paddw mm4, mm6 /* mm4 += pixel 6 */
  1095. pmullw mm2, mm2 /* mm2 = pixel2^2 */
  1096. pmullw mm6, mm6 /* mm6 = pixel6^2 */
  1097. paddw mm1, mm2 /* mm1 += pixel2^2 */
  1098. paddw mm5, mm6 /* mm5 += pixel6^2 */
  1099. movq mm2, [edi+48] /* Pixel 3 */
  1100. movq mm6, [edi+112] /* Pixel 7 */
  1101. psubw mm2, mm3 /* mm2 -=128 */
  1102. psubw mm6, mm3 /* mm6 -=128 */
  1103. paddw mm0, mm2 /* mm0 += pixel 3 */
  1104. paddw mm4, mm6 /* mm4 += pixel 7 */
  1105. pmullw mm2, mm2 /* mm2 = pixel3^2 */
  1106. pmullw mm6, mm6 /* mm6 = pixel7^2 */
  1107. paddw mm1, mm2 /* mm1 += pixel3^2 */
  1108. paddw mm5, mm6 /* mm5 += pixel7^2 */
  1109. movq mm2, [edi+64] /* Pixel 4 */
  1110. movq mm6, [edi+128] /* Pixel 8 */
  1111. psubw mm2, mm3 /* mm2 -=128 */
  1112. psubw mm6, mm3 /* mm6 -=128 */
  1113. paddw mm0, mm2 /* mm0 += pixel 4 */
  1114. paddw mm4, mm6 /* mm4 += pixel 8 */
  1115. pmullw mm2, mm2 /* mm2 = pixel4^2 */
  1116. pmullw mm6, mm6 /* mm6 = pixel8^2 */
  1117. paddw mm1, mm2 /* mm1 += pixel4^2 */
  1118. paddw mm5, mm6 /* mm5 += pixel8^2 */
  1119. /* mm0 = x1 + x2 + x3 + x4 */
  1120. /* mm1 = x1^2 + x2^2 + x3^2 + x4^2 */
  1121. /* mm4 = x5 + x6 + x7 + x8 */
  1122. /* mm5 = x5^2 + x6^2 + x7^2 + x8^2 */
  1123. psrlw mm3, 7 /* mm3 = 0001000100010001 */
  1124. movq mm2, mm0 /* make copy of sum1 */
  1125. movq mm6, mm4 /* make copy of sum2 */
  1126. paddw mm0, mm3 /* (sum1 + 1) */
  1127. paddw mm4, mm3 /* (sum2 + 1) */
  1128. psraw mm2, 1 /* sum1 /2 */
  1129. psraw mm6, 1 /* sum2 /2 */
  1130. psraw mm0, 1 /* (sum1 + 1)/2 */
  1131. psraw mm4, 1 /* (sum2 + 1)/2 */
  1132. pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
  1133. pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
  1134. psubw mm1, mm2 /* Variance 1 */
  1135. psubw mm5, mm6 /* Variance 2 */
  1136. movq [Variance12], mm1 /* Save Variance1 */
  1137. movq [Variance22], mm5 /* Save Variance2 */
  1138. movq mm3, FLimitMmx /* mm3 = FLimit */
  1139. movq mm2, mm1 /* copy of Variance 1 */
  1140. movq mm6, mm5 /* copy of Variance 2 */
  1141. psubw mm1, mm3 /* Variance 1 < Flimit? */
  1142. psubw mm5, mm3 /* Variance 2 < Flimit? */
  1143. psraw mm6, 15 /* Variance 2 > 32768? */
  1144. psraw mm2, 15 /* Variance 1 > 32768? */
  1145. psraw mm1, 15 /* FFFF/0000 for true/false */
  1146. psraw mm5, 15 /* FFFF/0000 for true/false */
  1147. movq mm0, [edi+64] /* mm0 = Pixel 4 */
  1148. pandn mm2, mm1 /* Variance1<32768 &&
  1149. Variance1<Flimit */
  1150. pandn mm6, mm5 /* Variance2<32768 &&
  1151. Variance2<Flimit */
  1152. movq mm4, [edi+80] /* mm4 = Pixel 5 */
  1153. pand mm6, mm2 /* mm6 = Variance1 < Flimit */
  1154. /* &&Variance2 < Flimit */
  1155. movq mm2, mm0 /* make copy of Pixel4 */
  1156. psubusw mm0, mm4 /* 4 - 5 */
  1157. psubusw mm4, mm2 /* 5 - 4 */
  1158. por mm0, mm4 /* abs(4 - 5) */
  1159. psubw mm0, QStepMmx /* abs(4-5)<QStepMmx ? */
  1160. psraw mm0, 15 /* FFFF/0000 for True/False */
  1161. pand mm0, mm6
  1162. sub edi, 8 /* offset edi back */
  1163. /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
  1164. /* mm0 and mm7 are now in use */
  1165. /* Let's do the filtering now */
  1166. /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
  1167. /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
  1168. movq mm5, [edi] /* mm5 = -5 */
  1169. movq mm4, [edi + 16] /* mm4 = -4 */
  1170. movq mm3, mm4 /* copy of -4 */
  1171. movq mm6, mm5 /* copy of -5 */
  1172. psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
  1173. psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
  1174. por mm4, mm5 /* abs([-4]-[-5] ) */
  1175. psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
  1176. psraw mm4, 15 /* FFFF/0000 for True/False */
  1177. movq mm1, mm4 /* copy of the mm4 */
  1178. pand mm4, mm6 /* */
  1179. pandn mm1, mm3 /* */
  1180. por mm1, mm4 /* mm1 = p1 */
  1181. /* now find P2 */
  1182. movq mm4, [edi+128] /* mm4 = [3] */
  1183. movq mm5, [edi+144] /* mm5 = [4] */
  1184. movq mm3, mm4 /* copy of 3 */
  1185. movq mm6, mm5 /* copy of 4 */
  1186. psubusw mm4, mm6 /* mm4 = [3] - [4] */
  1187. psubusw mm5, mm3 /* mm5 = [4] - [3] */
  1188. por mm4, mm5 /* abs([3]-[4] ) */
  1189. psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
  1190. psraw mm4, 15 /* FFFF/0000 for True/False */
  1191. movq mm2, mm4 /* copy of the mm4 */
  1192. pand mm4, mm6 /* */
  1193. pandn mm2, mm3 /* */
  1194. por mm2, mm4 /* mm2 = p2 */
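  /* The p1/p2 selection relies on the unsigned-saturating abs trick:
     psubusw clamps a negative difference to zero, so oring the two
     one-sided differences yields |a - b|. A C sketch (helper name is
     illustrative):

     static unsigned short AbsDiffU16(unsigned short a, unsigned short b)
     {
         unsigned short d1 = (unsigned short)(a > b ? a - b : 0);  // psubusw a, b
         unsigned short d2 = (unsigned short)(b > a ? b - a : 0);  // psubusw b, a
         return (unsigned short)(d1 | d2);                         // por
     }

     p1 = AbsDiffU16(x[-4], x[-5]) < QStep ? x[-5] : x[-4];
     p2 = AbsDiffU16(x[ 3], x[ 4]) < QStep ? x[ 4] : x[ 3];
  */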
  1195. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  1196. /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
  1197. /* Des[-w4] = Src[-w4]; */
  1198. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
  1199. movq mm3, mm1 /* mm3 = p1 */
  1200. paddw mm3, mm3 /* mm3 = p1 + p1 */
  1201. paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
  1202. movq mm4, [edi+16] /* mm4 = x1 */
  1203. paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
  1204. paddw mm4, [edi+48] /* mm4 = x1+x3 */
  1205. paddw mm3, [edi+64] /* mm3 += x4 */
  1206. paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
  1207. paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
  1208. movq mm4, mm3 /* mm4 = mm3 */
  1209. movq mm5, [edi+16] /* mm5 = x1 */
  1210. paddw mm4, mm5 /* mm4 = sum+x1 */
  1211. psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
  1212. psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
  1213. paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
  1214. psraw mm4, 4 /* mm4 >>=4 */
  1215. psubw mm4, mm5 /* New Value - old Value */
  1216. pand mm4, mm7 /* And the flag */
  1217. paddw mm4, mm5 /* add the old value back */
  1218. movq [esi], mm4 /* Write new x1 */
  1219. /* sum += x5 -p1 */
  1220. /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
  1221. movq mm5, [edi+32] /* mm5= x2 */
  1222. psubw mm3, mm1 /* sum=sum-p1 */
  1223. paddw mm3, [edi+80] /* sum=sum+x5 */
  1224. movq mm4, mm5 /* copy x2 */
  1225. paddw mm4, mm3 /* mm4=sum+x2 */
  1226. paddw mm4, mm4 /* mm4 <<= 1 */
  1227. psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
  1228. paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
  1229. psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
  1230. psubw mm4, mm5 /* new value - old value */
  1231. pand mm4, mm7 /* And the flag */
  1232. paddw mm4, mm5 /* add the old value back */
  1233. movq [esi+16], mm4 /* write new x2 */
  1234. /* sum += x6 - p1 */
  1235. /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
  1236. movq mm5, [edi+48] /* mm5= x3 */
  1237. psubw mm3, mm1 /* sum=sum-p1 */
  1238. paddw mm3, [edi+96] /* sum=sum+x6 */
  1239. movq mm4, mm5 /* copy x3 */
  1240. paddw mm4, mm3 /* mm4=sum+x3 */
  1241. paddw mm4, mm4 /* mm4 <<= 1 */
  1242. psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
  1243. paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
  1244. psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
  1245. psubw mm4, mm5 /* new value - old value */
  1246. pand mm4, mm7 /* And the flag */
  1247. paddw mm4, mm5 /* add the old value back */
  1248. movq [esi+32], mm4 /* write new x3 */
  1249. /* sum += x7 - p1 */
  1250. /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8)>>4 */
  1251. movq mm5, [edi+64] /* mm5 = x4 */
  1252. psubw mm3, mm1 /* sum = sum-p1 */
  1253. paddw mm3, [edi+112] /* sum = sum+x7 */
  1254. movq mm4, mm5 /* mm4 = x4 */
  1255. paddw mm4, mm3 /* mm4 = sum + x4 */
  1256. paddw mm4, mm4 /* mm4 *=2 */
  1257. paddw mm4, mm1 /* += p1 */
  1258. psubw mm4, [edi+16] /* -= x1 */
  1259. psubw mm4, [edi+112] /* -= x7 */
  1260. paddw mm4, [edi+128] /* += x8 */
  1261. psraw mm4, 4 /* >>=4 */
  1262. psubw mm4, mm5 /* -=x4 */
  1263. pand mm4, mm7 /* and flag */
  1264. paddw mm4, mm5 /* += x4 */
  1265. movq [esi+48], mm4 /* write new x4 */
  1266. /* sum+= x8-x1 */
  1267. /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
  1268. movq mm5, [edi+80] /* mm5 = x5 */
  1269. psubw mm3, [edi+16] /* sum -= x1 */
  1270. paddw mm3, [edi+128] /* sum += x8 */
  1271. movq mm4, mm5 /* mm4 = x5 */
  1272. paddw mm4, mm3 /* mm4= sum+x5 */
  1273. paddw mm4, mm4 /* mm4 *= 2 */
  1274. paddw mm4, [edi+16] /* += x1 */
  1275. psubw mm4, [edi+32] /* -= x2 */
  1276. psubw mm4, [edi+128] /* -= x8 */
  1277. paddw mm4, mm2 /* += p2 */
  1278. psraw mm4, 4 /* >>=4 */
  1279. psubw mm4, mm5 /* -=x5 */
  1280. pand mm4, mm7 /* and flag */
  1281. paddw mm4, mm5 /* += x5 */
  1282. movq [esi+64], mm4 /* write new x5 */
  1283. /* sum += p2 - x2 */
  1284. /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
  1285. movq mm5, [edi+96] /* mm5 = x6 */
  1286. psubw mm3, [edi+32] /* -= x2 */
  1287. paddw mm3, mm2 /* += p2 */
  1288. movq mm4, mm5 /* mm4 = x6 */
  1289. paddw mm4, mm3 /* mm4 = sum+x6 */
  1290. paddw mm4, mm4 /* mm4 *= 2*/
  1291. paddw mm4, [edi+32] /* +=x2 */
  1292. psubw mm4, [edi+48] /* -=x3 */
  1293. psraw mm4, 4 /* >>=4 */
  1294. psubw mm4, mm5 /* -=x6 */
  1295. pand mm4, mm7 /* and flag */
  1296. paddw mm4, mm5 /* += x6 */
  1297. movq [esi+80], mm4 /* write new x6 */
  1298. /* sum += p2 - x3 */
  1299. /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
  1300. movq mm5, [edi+112] /* mm5 = x7 */
  1301. psubw mm3, [edi+48] /* -= x3 */
  1302. paddw mm3, mm2 /* += p2 */
  1303. movq mm4, mm5 /* mm4 = x7 */
  1304. paddw mm4, mm3 /* mm4 = sum+x7 */
  1305. paddw mm4, mm4 /* mm4 *= 2*/
  1306. paddw mm4, [edi+48] /* +=x3 */
  1307. psubw mm4, [edi+64] /* -=x4 */
  1308. psraw mm4, 4 /* >>=4 */
  1309. psubw mm4, mm5 /* -=x7 */
  1310. pand mm4, mm7 /* and flag */
  1311. paddw mm4, mm5 /* += x7 */
  1312. movq [esi+96], mm4 /* write new x7 */
  1313. /* sum += p2 - x4 */
  1314. /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
  1315. movq mm5, [edi+128] /* mm5 = x8 */
  1316. psubw mm3, [edi+64] /* -= x4 */
  1317. paddw mm3, mm2 /* += p2 */
  1318. movq mm4, mm5 /* mm4 = x8 */
  1319. paddw mm4, mm3 /* mm4 = sum+x8 */
  1320. paddw mm4, mm4 /* mm4 *= 2*/
  1321. paddw mm4, [edi+64] /* +=x4 */
  1322. psubw mm4, [edi+80] /* -=x5 */
  1323. psraw mm4, 4 /* >>=4 */
  1324. psubw mm4, mm5 /* -=x8 */
  1325. pand mm4, mm7 /* and flag */
  1326. paddw mm4, mm5 /* += x8 */
  1327. movq [esi+112], mm4 /* write new x8 */
  1328. /* done with the left four columns */
  1329. /* now do the right four columns */
  1330. add edi, 8 /* shift to right four column */
  1331. add esi, 8 /* shift to right four column */
  1332. /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
  1333. /* mm0 is now in use */
  1334. /* Let's do the filtering now */
  1335. /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
  1336. /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
  1337. movq mm5, [edi] /* mm5 = -5 */
  1338. movq mm4, [edi + 16] /* mm4 = -4 */
  1339. movq mm3, mm4 /* copy of -4 */
  1340. movq mm6, mm5 /* copy of -5 */
  1341. psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
  1342. psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
  1343. por mm4, mm5 /* abs([-4]-[-5] ) */
  1344. psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
  1345. psraw mm4, 15 /* FFFF/0000 for True/False */
  1346. movq mm1, mm4 /* copy of the mm4 */
  1347. pand mm4, mm6 /* */
  1348. pandn mm1, mm3 /* */
  1349. por mm1, mm4 /* mm1 = p1 */
  1350. /* now find P2 */
  1351. movq mm4, [edi+128] /* mm4 = [3] */
  1352. movq mm5, [edi+144] /* mm5 = [4] */
  1353. movq mm3, mm4 /* copy of 3 */
  1354. movq mm6, mm5 /* copy of 4 */
  1355. psubusw mm4, mm6 /* mm4 = [3] - [4] */
  1356. psubusw mm5, mm3 /* mm5 = [4] - [3] */
  1357. por mm4, mm5 /* abs([3]-[4] ) */
  1358. psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
  1359. psraw mm4, 15 /* FFFF/0000 for True/False */
  1360. movq mm2, mm4 /* copy of the mm4 */
  1361. pand mm4, mm6 /* */
  1362. pandn mm2, mm3 /* */
  1363. por mm2, mm4 /* mm2 = p2 */
  1364. /* psum = p1 + p1 + p1 + v[1] + v[2] + v[3] + v[4] + 4; */
  1365. /* Des[-w4] = (((psum + v[1]) << 1) - (v[4] - v[5])) >> 4; */
  1366. /* Des[-w4]=Src[-w4]; */
  1367. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
  1368. movq mm3, mm1 /* mm3 = p1 */
  1369. paddw mm3, mm3 /* mm3 = p1 + p1 */
  1370. paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
  1371. movq mm4, [edi+16] /* mm4 = x1 */
  1372. paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
  1373. paddw mm4, [edi+48] /* mm4 = x1+x3 */
  1374. paddw mm3, [edi+64] /* mm3 += x4 */
  1375. paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
  1376. paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
  1377. movq mm4, mm3 /* mm4 = mm3 */
  1378. movq mm5, [edi+16] /* mm5 = x1 */
  1379. paddw mm4, mm5 /* mm4 = sum+x1 */
  1380. psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
  1381. psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
  1382. paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
  1383. psraw mm4, 4 /* mm4 >>=4 */
  1384. psubw mm4, mm5 /* New Value - old Value */
  1385. pand mm4, mm0 /* And the flag */
  1386. paddw mm4, mm5 /* add the old value back */
  1387. movq [esi], mm4 /* Write new x1 */
  1388. /* sum += x5 -p1 */
  1389. /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
  1390. movq mm5, [edi+32] /* mm5= x2 */
  1391. psubw mm3, mm1 /* sum=sum-p1 */
  1392. paddw mm3, [edi+80] /* sum=sum+x5 */
  1393. movq mm4, mm5 /* copy x2 */
  1394. paddw mm4, mm3 /* mm4=sum+x2 */
  1395. paddw mm4, mm4 /* mm4 <<= 1 */
  1396. psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
  1397. paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
  1398. psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
  1399. psubw mm4, mm5 /* new value - old value */
  1400. pand mm4, mm0 /* And the flag */
  1401. paddw mm4, mm5 /* add the old value back */
  1402. movq [esi+16], mm4 /* write new x2 */
  1403. /* sum += x6 - p1 */
  1404. /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
  1405. movq mm5, [edi+48] /* mm5= x3 */
  1406. psubw mm3, mm1 /* sum=sum-p1 */
  1407. paddw mm3, [edi+96] /* sum=sum+x6 */
  1408. movq mm4, mm5 /* copy x3 */
  1409. paddw mm4, mm3 /* mm4=sum+x3 */
  1410. paddw mm4, mm4 /* mm4 <<= 1 */
  1411. psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
  1412. paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
  1413. psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
  1414. psubw mm4, mm5 /* new value - old value */
  1415. pand mm4, mm0 /* And the flag */
  1416. paddw mm4, mm5 /* add the old value back */
  1417. movq [esi+32], mm4 /* write new x3 */
  1418. /* sum += x7 - p1 */
  1419. /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8)>>4 */
  1420. movq mm5, [edi+64] /* mm5 = x4 */
  1421. psubw mm3, mm1 /* sum = sum-p1 */
  1422. paddw mm3, [edi+112] /* sum = sum+x7 */
  1423. movq mm4, mm5 /* mm4 = x4 */
  1424. paddw mm4, mm3 /* mm4 = sum + x4 */
  1425. paddw mm4, mm4 /* mm4 *=2 */
  1426. paddw mm4, mm1 /* += p1 */
  1427. psubw mm4, [edi+16] /* -= x1 */
  1428. psubw mm4, [edi+112] /* -= x7 */
  1429. paddw mm4, [edi+128] /* += x8 */
  1430. psraw mm4, 4 /* >>=4 */
  1431. psubw mm4, mm5 /* -=x4 */
  1432. pand mm4, mm0 /* and flag */
  1433. paddw mm4, mm5 /* += x4 */
  1434. movq [esi+48], mm4 /* write new x4 */
  1435. /* sum+= x8-x1 */
  1436. /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
  1437. movq mm5, [edi+80] /* mm5 = x5 */
  1438. psubw mm3, [edi+16] /* sum -= x1 */
  1439. paddw mm3, [edi+128] /* sum += x8 */
  1440. movq mm4, mm5 /* mm4 = x5 */
  1441. paddw mm4, mm3 /* mm4= sum+x5 */
  1442. paddw mm4, mm4 /* mm4 *= 2 */
  1443. paddw mm4, [edi+16] /* += x1 */
  1444. psubw mm4, [edi+32] /* -= x2 */
  1445. psubw mm4, [edi+128] /* -= x8 */
  1446. paddw mm4, mm2 /* += p2 */
  1447. psraw mm4, 4 /* >>=4 */
  1448. psubw mm4, mm5 /* -=x5 */
  1449. pand mm4, mm0 /* and flag */
  1450. paddw mm4, mm5 /* += x5 */
  1451. movq [esi+64], mm4 /* write new x5 */
  1452. /* sum += p2 - x2 */
  1453. /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
  1454. movq mm5, [edi+96] /* mm5 = x6 */
  1455. psubw mm3, [edi+32] /* -= x2 */
  1456. paddw mm3, mm2 /* += p2 */
  1457. movq mm4, mm5 /* mm4 = x6 */
  1458. paddw mm4, mm3 /* mm4 = sum+x6 */
  1459. paddw mm4, mm4 /* mm4 *= 2*/
  1460. paddw mm4, [edi+32] /* +=x2 */
  1461. psubw mm4, [edi+48] /* -=x3 */
  1462. psraw mm4, 4 /* >>=4 */
  1463. psubw mm4, mm5 /* -=x6 */
  1464. pand mm4, mm0 /* and flag */
  1465. paddw mm4, mm5 /* += x6 */
  1466. movq [esi+80], mm4 /* write new x6 */
  1467. /* sum += p2 - x3 */
  1468. /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
  1469. movq mm5, [edi+112] /* mm5 = x7 */
  1470. psubw mm3, [edi+48] /* -= x3 */
  1471. paddw mm3, mm2 /* += p2 */
  1472. movq mm4, mm5 /* mm4 = x7 */
  1473. paddw mm4, mm3 /* mm4 = sum+x7 */
  1474. paddw mm4, mm4 /* mm4 *= 2*/
  1475. paddw mm4, [edi+48] /* +=x3 */
  1476. psubw mm4, [edi+64] /* -=x4 */
  1477. psraw mm4, 4 /* >>=4 */
  1478. psubw mm4, mm5 /* -=x7 */
  1479. pand mm4, mm0 /* and flag */
  1480. paddw mm4, mm5 /* += x7 */
  1481. movq [esi+96], mm4 /* write new x7 */
  1482. /* sum += p2 - x4 */
  1483. /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
  1484. movq mm5, [edi+128] /* mm5 = x8 */
  1485. psubw mm3, [edi+64] /* -= x4 */
  1486. paddw mm3, mm2 /* += p2 */
  1487. movq mm4, mm5 /* mm4 = x8 */
  1488. paddw mm4, mm3 /* mm4 = sum+x8 */
  1489. paddw mm4, mm4 /* mm4 *= 2*/
  1490. paddw mm4, [edi+64] /* +=x4 */
  1491. psubw mm4, [edi+80] /* -=x5 */
  1492. psraw mm4, 4 /* >>=4 */
  1493. psubw mm4, mm5 /* -=x8 */
  1494. pand mm4, mm0 /* and flag */
  1495. paddw mm4, mm5 /* += x8 */
  1496. movq [esi+112], mm4 /* write new x8 */
  1497. /* done with the right four columns */
  1498. /* transpose */
  1499. mov eax, Des /* the destination */
  1500. add edi, 8 /* shift edi to point x1 */
  1501. sub esi, 8 /* shift esi back to left x1 */
  1502. sub eax, 4 /* eax = Des - 4 */
  1503. movq mm0, [esi] /* mm0 = 30 20 10 00 */
  1504. movq mm1, [esi+16] /* mm1 = 31 21 11 01 */
  1505. movq mm4, mm0 /* mm4 = 30 20 10 00 */
  1506. punpcklwd mm0, mm1 /* mm0 = 11 10 01 00 */
  1507. punpckhwd mm4, mm1 /* mm4 = 31 30 21 20 */
  1508. movq mm2, [esi+32] /* mm2 = 32 22 12 02 */
  1509. movq mm3, [esi+48] /* mm3 = 33 23 13 03 */
  1510. movq mm5, mm2 /* mm5 = 32 22 12 02 */
  1511. punpcklwd mm2, mm3 /* mm2 = 13 12 03 02 */
  1512. punpckhwd mm5, mm3 /* mm5 = 33 32 23 22 */
  1513. movq mm1, mm0 /* mm1 = 11 10 01 00 */
  1514. punpckldq mm0, mm2 /* mm0 = 03 02 01 00 */
  1515. movq [edi], mm0 /* write 00 01 02 03 */
  1516. punpckhdq mm1, mm2 /* mm1 = 13 12 11 10 */
  1517. movq mm0, mm4 /* mm0 = 31 30 21 20 */
  1518. movq [edi+16], mm1 /* write 10 11 12 13 */
  1519. punpckldq mm0, mm5 /* mm0 = 23 22 21 20 */
  1520. punpckhdq mm4, mm5 /* mm4 = 33 32 31 30 */
  1521. movq mm1, [esi+64] /* mm1 = 34 24 14 04 */
  1522. movq mm2, [esi+80] /* mm2 = 35 25 15 05 */
  1523. movq mm5, [esi+96] /* mm5 = 36 26 16 06 */
  1524. movq mm6, [esi+112] /* mm6 = 37 27 17 07 */
  1525. movq mm3, mm1 /* mm3 = 34 24 14 04 */
  1526. movq mm7, mm5 /* mm7 = 36 26 16 06 */
  1527. punpcklwd mm1, mm2 /* mm1 = 15 14 05 04 */
  1528. punpckhwd mm3, mm2 /* mm3 = 35 34 25 24 */
  1529. punpcklwd mm5, mm6 /* mm5 = 17 16 07 06 */
  1530. punpckhwd mm7, mm6 /* mm7 = 37 36 27 26 */
  1531. movq mm2, mm1 /* mm2 = 15 14 05 04 */
  1532. movq mm6, mm3 /* mm6 = 35 34 25 24 */
  1533. punpckldq mm1, mm5 /* mm1 = 07 06 05 04 */
  1534. punpckhdq mm2, mm5 /* mm2 = 17 16 15 14 */
  1535. punpckldq mm3, mm7 /* mm3 = 27 26 25 24 */
  1536. punpckhdq mm6, mm7 /* mm6 = 37 36 35 34 */
  1537. movq mm5, [edi] /* mm5 = 03 02 01 00 */
  1538. packuswb mm5, mm1 /* mm5 = 07 06 05 04 03 02 01 00 */
  1539. movq [eax], mm5 /* write 00 01 02 03 04 05 06 07 */
  1540. movq mm7, [edi+16] /* mm7 = 13 12 11 10 */
  1541. packuswb mm7, mm2 /* mm7 = 17 16 15 14 13 12 11 10 */
  1542. movq [eax+ecx], mm7 /* write 10 11 12 13 14 15 16 17 */
  1543. packuswb mm0, mm3 /* mm0 = 27 26 25 24 23 22 21 20 */
  1544. packuswb mm4, mm6 /* mm4 = 37 36 35 34 33 32 31 30 */
  1545. movq [eax+ecx*2], mm0 /* write 20 21 22 23 24 25 26 27 */
  1546. lea eax, [eax+ecx*4] /* move the desPtr forward */
  1547. movq [eax+edx], mm4 /* write 30 31 32 33 34 35 36 37 */
  1548. add edi, 8 /* move to right four column */
  1549. add esi, 8 /* move to right x1 */
  1550. movq mm0, [esi] /* mm0 = 70 60 50 40 */
  1551. movq mm1, [esi+16] /* mm1 = 71 61 51 41 */
  1552. movq mm4, mm0 /* mm4 = 70 60 50 40 */
  1553. punpcklwd mm0, mm1 /* mm0 = 51 50 41 40 */
  1554. punpckhwd mm4, mm1 /* mm4 = 71 70 61 60 */
  1555. movq mm2, [esi+32] /* mm2 = 72 62 52 42 */
  1556. movq mm3, [esi+48] /* mm3 = 73 63 53 43 */
  1557. movq mm5, mm2 /* mm5 = 72 62 52 42 */
  1558. punpcklwd mm2, mm3 /* mm2 = 53 52 43 42 */
  1559. punpckhwd mm5, mm3 /* mm5 = 73 72 63 62 */
  1560. movq mm1, mm0 /* mm1 = 51 50 41 40 */
  1561. punpckldq mm0, mm2 /* mm0 = 43 42 41 40 */
  1562. movq [edi], mm0 /* write 40 41 42 43 */
  1563. punpckhdq mm1, mm2 /* mm1 = 53 52 51 50 */
  1564. movq mm0, mm4 /* mm0 = 71 70 61 60 */
  1565. movq [edi+16], mm1 /* write 50 51 52 53 */
  1566. punpckldq mm0, mm5 /* mm0 = 63 62 61 60 */
  1567. punpckhdq mm4, mm5 /* mm4 = 73 72 71 70 */
  1568. movq mm1, [esi+64] /* mm1 = 74 64 54 44 */
  1569. movq mm2, [esi+80] /* mm2 = 75 65 55 45 */
  1570. movq mm5, [esi+96] /* mm5 = 76 66 56 46 */
  1571. movq mm6, [esi+112] /* mm6 = 77 67 57 47 */
  1572. movq mm3, mm1 /* mm3 = 74 64 54 44 */
  1573. movq mm7, mm5 /* mm7 = 76 66 56 46 */
  1574. punpcklwd mm1, mm2 /* mm1 = 55 54 45 44 */
  1575. punpckhwd mm3, mm2 /* mm3 = 75 74 65 64 */
  1576. punpcklwd mm5, mm6 /* mm5 = 57 56 47 46 */
  1577. punpckhwd mm7, mm6 /* mm7 = 77 76 67 66 */
  1578. movq mm2, mm1 /* mm2 = 55 54 45 44 */
  1579. movq mm6, mm3 /* mm6 = 75 74 65 64 */
  1580. punpckldq mm1, mm5 /* mm1 = 47 46 45 44 */
  1581. punpckhdq mm2, mm5 /* mm2 = 57 56 55 54 */
  1582. punpckldq mm3, mm7 /* mm3 = 67 66 65 64 */
  1583. punpckhdq mm6, mm7 /* mm6 = 77 76 75 74 */
  1584. movq mm5, [edi] /* mm5 = 43 42 41 40 */
  1585. packuswb mm5, mm1 /* mm5 = 47 46 45 44 43 42 41 40 */
  1586. movq [eax], mm5 /* write 40 41 42 43 44 45 46 47 */
  1587. movq mm7, [edi+16] /* mm7 = 53 52 51 50 */
  1588. packuswb mm7, mm2 /* mm7 = 57 56 55 54 53 52 51 50 */
  1589. movq [eax+ecx], mm7 /* write 50 51 52 53 54 55 56 57 */
  1590. packuswb mm0, mm3 /* mm0 = 67 66 65 64 63 62 61 60 */
  1591. packuswb mm4, mm6 /* mm4 = 77 76 75 74 73 72 71 70 */
  1592. movq [eax+ecx*2], mm0 /* write 60 61 62 63 64 65 66 67 */
  1593. lea eax, [eax+ecx*4] /* move the desPtr forward */
  1594. movq [eax+edx], mm4 /* write 70 71 72 73 74 75 76 77 */
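  /* packuswb narrows the filtered words back to bytes with unsigned
     saturation, so the stores above also clamp every result to [0,255].
     A scalar sketch of that clamp:

     static unsigned char ClampU8(int v)
     {
         return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
     }
  */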
  1595. pop edi
  1596. pop esi
  1597. pop edx
  1598. pop ecx
  1599. pop ebp
  1600. pop eax
  1601. }
  1602. Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
  1603. Var1 += Variance12[0]+ Variance12[1]+Variance12[2]+Variance12[3];
  1604. pbi->FragmentVariances[CurrentFrag] += Var1;
  1605. Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
  1606. Var2 += Variance22[0]+ Variance22[1]+Variance22[2]+Variance22[3];
  1607. pbi->FragmentVariances[CurrentFrag + 1] += Var2;
  1608. }
  1609. CurrentFrag ++;
  1610. Src += 8;
  1611. Des += 8;
  1612. }
  1613. }
  1614. /****************************************************************************
  1615. *
  1616. * ROUTINE : DeblockNonFilteredBand_MMX
  1617. *
  1618. * INPUTS : None
  1619. *
  1620. * OUTPUTS : None
  1621. *
  1622. * RETURNS : None
  1623. *
  1624. * FUNCTION : Filter both horizontal and vertical edges in a band
  1625. *
  1626. * SPECIAL NOTES :
  1627. *
  1628. * REFERENCE :
  1629. *
  1630. * ERRORS : None.
  1631. *
  1632. ****************************************************************************/
  1633. void DeblockNonFilteredBand_MMX(
  1634. POSTPROC_INSTANCE *pbi,
  1635. UINT8 *SrcPtr,
  1636. UINT8 *DesPtr,
  1637. UINT32 PlaneLineStep,
  1638. UINT32 FragAcross,
  1639. UINT32 StartFrag,
  1640. UINT32 *QuantScale
  1641. )
  1642. {
  1643. UINT32 j;
  1644. UINT32 CurrentFrag=StartFrag;
  1645. UINT32 QStep;
  1646. UINT32 LoopFLimit;
  1647. UINT8 *Src, *Des;
  1648. UINT32 Var1, Var2;
  1649. #if defined(_WIN32_WCE)
  1650. #pragma pack(16)
  1651. short QStepMmx[4];
  1652. short FLimitMmx[4];
  1653. short LoopFLimitMmx[4];
  1654. short Rows[80];
  1655. short NewRows[64];
  1656. short LoopFilteredValuesUp[4];
  1657. short LoopFilteredValuesDown[4];
  1658. unsigned short Variance11[4];
  1659. unsigned short Variance12[4];
  1660. unsigned short Variance21[4];
  1661. unsigned short Variance22[4];
  1662. #pragma pack()
  1663. #else
  1664. __declspec(align(16)) short QStepMmx[4];
  1665. __declspec(align(16)) short FLimitMmx[4];
  1666. __declspec(align(16)) short LoopFLimitMmx[4];
  1667. __declspec(align(16)) short Rows[80];
  1668. __declspec(align(16)) short NewRows[64];
  1669. __declspec(align(16)) short LoopFilteredValuesUp[4];
  1670. __declspec(align(16)) short LoopFilteredValuesDown[4];
  1671. __declspec(align(16)) unsigned short Variance11[4];
  1672. __declspec(align(16)) unsigned short Variance12[4];
  1673. __declspec(align(16)) unsigned short Variance21[4];
  1674. __declspec(align(16)) unsigned short Variance22[4];
  1675. #endif
  1676. LoopFLimit = DeblockLimitValuesV2[pbi->FrameQIndex];
  1677. LoopFLimitMmx[0] = (INT16)LoopFLimit;
  1678. LoopFLimitMmx[1] = (INT16)LoopFLimit;
  1679. LoopFLimitMmx[2] = (INT16)LoopFLimit;
  1680. LoopFLimitMmx[3] = (INT16)LoopFLimit;
  1681. while(CurrentFrag < StartFrag + FragAcross )
  1682. {
  1683. Src=SrcPtr+8*(CurrentFrag-StartFrag);
  1684. Des=DesPtr+8*(CurrentFrag-StartFrag);
  1685. QStep = QuantScale[ pbi->FragQIndex[CurrentFrag+FragAcross]];
  1686. __asm
  1687. {
  1688. push eax
  1689. push ebp
  1690. push ecx
  1691. push edx
  1692. push esi
  1693. push edi
  1694. /* Calculate the FLimit and store FLimit and QStep */
  1695. /* Copy the data to the intermediate buffer */
  1696. mov eax, QStep
  1697. xor edx, edx /* clear edx */
  1698. mov ecx, PlaneLineStep /* ecx = Pitch */
  1699. pcmpeqw mm6, mm6
  1700. movd mm5, eax
  1701. mov eax, Src /* eax = Src */
  1702. psrlw mm6, 14 /* mm6 = 3, 3, 3, 3*/
  1703. punpcklwd mm5, mm5
  1704. lea esi, NewRows /* esi = NewRows */
  1705. punpckldq mm5, mm5
  1706. sub edx, ecx /* edx = - Pitch */
  1707. pmullw mm6, mm5 /* Qstep * 3 */
  1708. movq QStepMmx, mm5
  1709. lea edi, Rows /* edi = Rows */
  1710. pxor mm7, mm7 /* Clear mm7 */
  1711. pmullw mm6, mm5
  1712. lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
  1713. movq mm0, [eax + edx] /* mm0 = Src[-5*Pitch] */
  1714. movq mm1, mm0 /* mm1 = mm0 */
  1715. punpcklbw mm0, mm7 /* Lower Four -5 */
  1716. psrlw mm6, 5
  1717. movq [FLimitMmx], mm6
  1718. movq mm2, [eax] /* mm2 = Src[-4*Pitch] */
  1719. punpckhbw mm1, mm7 /* Higher Four -5 */
  1720. movq mm3, mm2 /* mm3 = mm2 */
  1721. punpcklbw mm2, mm7 /* Lower Four -4 */
  1722. movq [edi], mm0 /* Write Lower Four of -5 */
  1723. punpckhbw mm3, mm7 /* higher Four -4 */
  1724. movq [edi+8], mm1 /* Write Higher Four of -5 */
  1725. movq mm4, [eax + ecx] /* mm4 = Src[-3*Pitch] */
  1726. movq [edi+16], mm2 /* Write Lower -4 */
  1727. movq [edi+24], mm3 /* write higher -4 */
  1728. movq mm5, mm4 /* mm5 = mm4 */
  1729. punpcklbw mm4, mm7 /* lower four -3 */
  1730. movq mm0, [eax + ecx *2] /* mm0 = Src[-2*Pitch] */
  1731. punpckhbw mm5, mm7 /* higher four -3 */
  1732. movq mm1, mm0 /* mm1 = mm0 */
  1733. movq [edi+32], mm4 /* write Lower -3 */
  1734. punpcklbw mm0, mm7 /* lower four -2 */
  1735. lea eax, [eax + ecx *4] /* eax = Src */
  1736. movq [edi+40], mm5 /* write Higher -3 */
  1737. punpckhbw mm1, mm7 /* higher four -2 */
  1738. movq mm2, [eax + edx] /* mm2 = Src[-Pitch] */
  1739. movq [edi+48], mm0 /* lower -2 */
  1740. movq mm3, mm2 /* mm3 = mm2 */
  1741. punpcklbw mm2, mm7 /* lower -1 */
  1742. movq [edi+56], mm1 /* higher -2 */
  1743. punpckhbw mm3, mm7 /* Higher -1 */
  1744. movq mm4, [eax] /* mm4 = Src[0] */
  1745. movq [edi+64], mm2 /* Lower -1 */
  1746. movq mm5, mm4 /* mm5 = mm4 */
  1747. movq [edi+72], mm3 /* Higher -1 */
  1748. punpcklbw mm4, mm7 /* lower 0 */
  1749. punpckhbw mm5, mm7 /* higher 0 */
  1750. movq mm0, [eax + ecx] /* mm0 = Src[Pitch] */
  1751. movq [edi+80], mm4 /* write lower 0 */
  1752. movq mm1, mm0 /* mm1 = mm0 */
  1753. movq [edi+88], mm5 /* write higher 0 */
  1754. punpcklbw mm0, mm7 /* lower 1 */
  1755. punpckhbw mm1, mm7 /* higher 1 */
  1756. movq mm2, [eax + ecx *2 ] /* mm2 = Src[2*Pitch] */
  1757. lea eax, [eax + ecx *4] /* eax = Src + 4 * Pitch */
  1758. movq mm3, mm2 /* mm3 = mm2 */
  1759. movq [edi+96], mm0 /* write lower 1 */
  1760. punpcklbw mm2, mm7 /* lower 2 */
  1761. punpckhbw mm3, mm7 /* higher 2 */
  1762. movq mm4, [eax + edx ] /* mm4 = Src[3*pitch] */
  1763. movq [edi+104], mm1 /* write higher 1 */
  1764. movq mm5, mm4 /* mm5 = mm4 */
  1765. punpcklbw mm4, mm7 /* Low 3 */
  1766. movq [edi+112], mm2 /* write lower 2 */
  1767. movq [edi+120], mm3 /* write higher 2 */
  1768. movq mm0, [eax] /* mm0 = Src[4*pitch] */
  1769. punpckhbw mm5, mm7 /* high 3 */
  1770. movq mm1, mm0 /* mm1=mm0 */
  1771. movq [edi+128], mm4 /* low 3 */
  1772. punpcklbw mm0, mm7 /* low 4 */
  1773. punpckhbw mm1, mm7 /* high 4 */
  1774. movq [edi+136], mm5 /* high 3 */
  1775. movq [edi+144], mm0 /* low 4 */
  1776. movq [edi+152], mm1 /* high 4 */
  1777. /*
  1778. mov eax, Des
  1779. lea eax, [eax+edx*4]
  1780. movq mm2, [eax]
  1781. movq mm2, [eax+ecx]
  1782. movq mm2, [eax+ecx*2]
  1783. lea eax, [eax+ecx*4]
  1784. movq mm2, [eax+edx]
  1785. movq mm2, [eax]
  1786. movq mm2, [eax+ecx]
  1787. movq mm2, [eax+ecx*2]
  1788. lea eax, [eax+ecx*4]
  1789. movq mm2, [eax+edx]
  1790. movq mm2, [eax]
  1791. */
  1792. /* done with copying everything to intermediate buffer */
  1793. /* Now, compute the variances for Pixel 1-4 and 5-8 */
  1794. /* we use mm0,mm1,mm2 for 1234 and mm4, mm5, mm6 for 5-8 */
  1795. /* mm7 = 0, mm3 = {128, 128, 128, 128} */
  1796. pcmpeqw mm3, mm3 /* mm3 = FFFFFFFFFFFFFFFF */
  1797. psllw mm3, 15 /* mm3 = 8000800080008000 */
  1798. psrlw mm3, 8 /* mm3 = 0080008000800080 */
  1799. movq mm2, [edi+16] /* Pixel 1 */
  1800. movq mm6, [edi+80] /* Pixel 5 */
  1801. psubw mm2, mm3 /* mm2 -=128 */
  1802. psubw mm6, mm3 /* mm6 -=128 */
  1803. movq mm0, mm2 /* mm0 = pixel 1 */
  1804. movq mm4, mm6 /* mm4 = pixel 5 */
  1805. pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
  1806. pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
  1807. movq mm1, mm2 /* mm1 = pixel1^2 */
  1808. movq mm5, mm6 /* mm5 = pixel5^2 */
  1809. movq mm2, [edi+32] /* Pixel 2 */
  1810. movq mm6, [edi+96] /* Pixel 6 */
  1811. psubw mm2, mm3 /* mm2 -=128 */
  1812. psubw mm6, mm3 /* mm6 -=128 */
  1813. paddw mm0, mm2 /* mm0 += pixel 2 */
  1814. paddw mm4, mm6 /* mm4 += pixel 6 */
  1815. pmullw mm2, mm2 /* mm2 = pixel2^2 */
  1816. pmullw mm6, mm6 /* mm6 = pixel6^2 */
  1817. paddw mm1, mm2 /* mm1 += pixel2^2 */
  1818. paddw mm5, mm6 /* mm5 += pixel6^2 */
  1819. movq mm2, [edi+48] /* Pixel 3 */
  1820. movq mm6, [edi+112] /* Pixel 7 */
  1821. psubw mm2, mm3 /* mm2 -=128 */
  1822. psubw mm6, mm3 /* mm6 -=128 */
  1823. paddw mm0, mm2 /* mm0 += pixel 3 */
  1824. paddw mm4, mm6 /* mm4 += pixel 7 */
  1825. pmullw mm2, mm2 /* mm2 = pixel3^2 */
  1826. pmullw mm6, mm6 /* mm6 = pixel7^2 */
  1827. paddw mm1, mm2 /* mm1 += pixel3^2 */
  1828. paddw mm5, mm6 /* mm5 += pixel7^2 */
  1829. movq mm2, [edi+64] /* Pixel 4 */
  1830. movq mm6, [edi+128] /* Pixel 8 */
  1831. psubw mm2, mm3 /* mm2 -=128 */
  1832. psubw mm6, mm3 /* mm6 -=128 */
  1833. paddw mm0, mm2 /* mm0 += pixel 4 */
  1834. paddw mm4, mm6 /* mm4 += pixel 8 */
  1835. pmullw mm2, mm2 /* mm2 = pixel4^2 */
  1836. pmullw mm6, mm6 /* mm6 = pixel8^2 */
  1837. paddw mm1, mm2 /* mm1 += pixel4^2 */
  1838. paddw mm5, mm6 /* mm5 += pixel8^2 */
  1839. /* mm0 = x1 + x2 + x3 + x4 */
  1840. /* mm1 = x1^2 + x2^2 + x3^2 + x4^2 */
  1841. /* mm4 = x5 + x6 + x7 + x8 */
  1842. /* mm5 = x5^2 + x6^2 + x7^2 + x8^2 */
  1843. movq mm7, mm3 /* mm7 = mm3 */
  1844. psrlw mm7, 7 /* mm7 = 0001000100010001 */
  1845. movq mm2, mm0 /* make copy of sum1 */
  1846. movq mm6, mm4 /* make copy of sum2 */
  1847. paddw mm0, mm7 /* (sum1 + 1) */
  1848. paddw mm4, mm7 /* (sum2 + 1) */
  1849. psraw mm2, 1 /* sum1 /2 */
  1850. psraw mm6, 1 /* sum2 /2 */
  1851. psraw mm0, 1 /* (sum1 + 1)/2 */
  1852. psraw mm4, 1 /* (sum2 + 1)/2 */
  1853. pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
  1854. pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
  1855. psubw mm1, mm2 /* Variance 1 */
  1856. psubw mm5, mm6 /* Variance 2 */
  1857. movq mm7, FLimitMmx /* mm7 = FLimit */
  1858. movq mm2, mm1 /* copy of Variance 1 */
  1859. movq mm6, mm5 /* copy of Variance 2 */
  1860. movq [Variance11], mm1 /* Save Variance1 */
  1861. movq [Variance21], mm5 /* Save Variance2 */
  1862. psubw mm1, mm7 /* Variance 1 < Flimit? */
  1863. psubw mm5, mm7 /* Variance 2 < Flimit? */
  1864. psraw mm2, 15 /* Variance 1 > 32768? */
  1865. psraw mm6, 15 /* Variance 2 > 32768? */
  1866. psraw mm1, 15 /* FFFF/0000 for true/false */
  1867. psraw mm5, 15 /* FFFF/0000 for true/false */
  1868. movq mm7, [edi+64] /* mm7 = Pixel 4 */
  1869. pandn mm2, mm1 /* Variance1<32768 &&
  1870. Variance1<Flimit */
  1871. pandn mm6, mm5 /* Variance2<32768 &&
  1872. Variance2<Flimit */
  1873. movq mm4, [edi+80] /* mm4 = Pixel 5 */
  1874. pand mm6, mm2 /* mm6 = Variance1 < Flimit */
  1875. /* &&Variance2 < Flimit */
  1876. movq mm2, mm7 /* make copy of Pixel4 */
  1877. psubusw mm7, mm4 /* 4 - 5 */
  1878. psubusw mm4, mm2 /* 5 - 4 */
  1879. por mm7, mm4 /* abs(4 - 5) */
  1880. psubw mm7, QStepMmx /* abs(4-5)<QStepMmx ? */
  1881. psraw mm7, 15 /* FFFF/0000 for True/False */
  1882. pand mm7, mm6
  1883. /* mm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
  1884. /* now let's look at the right four columns */
  1885. add edi, 8 /* offset 8 to right 4 cols */
  1886. movq mm2, [edi+16] /* Pixel 1 */
  1887. movq mm6, [edi+80] /* Pixel 5 */
  1888. psubw mm2, mm3 /* mm2 -=128 */
  1889. psubw mm6, mm3 /* mm6 -=128 */
  1890. movq mm0, mm2 /* mm0 = pixel 1 */
  1891. movq mm4, mm6 /* mm4 = pixel 5 */
  1892. pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
  1893. pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
  1894. movq mm1, mm2 /* mm1 = pixel1^2 */
  1895. movq mm5, mm6 /* mm5 = pixel5^2 */
  1896. movq mm2, [edi+32] /* Pixel 2 */
  1897. movq mm6, [edi+96] /* Pixel 6 */
  1898. psubw mm2, mm3 /* mm2 -=128 */
  1899. psubw mm6, mm3 /* mm6 -=128 */
  1900. paddw mm0, mm2 /* mm0 += pixel 2 */
  1901. paddw mm4, mm6 /* mm4 += pixel 6 */
  1902. pmullw mm2, mm2 /* mm2 = pixel2^2 */
  1903. pmullw mm6, mm6 /* mm6 = pixel6^2 */
  1904. paddw mm1, mm2 /* mm1 += pixel2^2 */
  1905. paddw mm5, mm6 /* mm5 += pixel6^2 */
  1906. movq mm2, [edi+48] /* Pixel 3 */
  1907. movq mm6, [edi+112] /* Pixel 7 */
  1908. psubw mm2, mm3 /* mm2 -=128 */
  1909. psubw mm6, mm3 /* mm6 -=128 */
  1910. paddw mm0, mm2 /* mm0 += pixel 3 */
  1911. paddw mm4, mm6 /* mm4 += pixel 7 */
  1912. pmullw mm2, mm2 /* mm2 = pixel3^2 */
  1913. pmullw mm6, mm6 /* mm6 = pixel7^2 */
  1914. paddw mm1, mm2 /* mm1 += pixel3^2 */
  1915. paddw mm5, mm6 /* mm5 += pixel7^2 */
  1916. movq mm2, [edi+64] /* Pixel 4 */
  1917. movq mm6, [edi+128] /* Pixel 8 */
  1918. psubw mm2, mm3 /* mm2 -=128 */
  1919. psubw mm6, mm3 /* mm6 -=128 */
  1920. paddw mm0, mm2 /* mm0 += pixel 4 */
  1921. paddw mm4, mm6 /* mm4 += pixel 8 */
  1922. pmullw mm2, mm2 /* mm2 = pixel4^2 */
  1923. pmullw mm6, mm6 /* mm6 = pixel8^2 */
  1924. paddw mm1, mm2 /* mm1 += pixel4^2 */
  1925. paddw mm5, mm6 /* mm5 += pixel8^2 */
  1926. /* mm0 = x1 + x2 + x3 + x4 */
  1927. /* mm1 = x1^2 + x2^2 + x3^2 + x4^2 */
  1928. /* mm4 = x5 + x6 + x7 + x8 */
  1929. /* mm5 = x5^2 + x6^2 + x7^2 + x8^2 */
  1930. psrlw mm3, 7 /* mm3 = 0001000100010001 */
  1931. movq mm2, mm0 /* make copy of sum1 */
  1932. movq mm6, mm4 /* make copy of sum2 */
  1933. paddw mm0, mm3 /* (sum1 + 1) */
  1934. paddw mm4, mm3 /* (sum2 + 1) */
  1935. psraw mm2, 1 /* sum1 /2 */
  1936. psraw mm6, 1 /* sum2 /2 */
  1937. psraw mm0, 1 /* (sum1 + 1)/2 */
  1938. psraw mm4, 1 /* (sum2 + 1)/2 */
  1939. pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
  1940. pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
  1941. psubw mm1, mm2 /* Variance 1 */
  1942. psubw mm5, mm6 /* Variance 2 */
  1943. movq [Variance12], mm1 /* Save Variance1 */
  1944. movq [Variance22], mm5 /* Save Variance2 */
  1945. movq mm3, FLimitMmx /* mm3 = FLimit */
  1946. movq mm2, mm1 /* copy of Variance 1 */
  1947. movq mm6, mm5 /* copy of Variance 2 */
  1948. psubw mm1, mm3 /* Variance 1 < Flimit? */
  1949. psubw mm5, mm3 /* Variance 2 < Flimit? */
  1950. psraw mm2, 15 /* Variance 1 > 32768? */
  1951. psraw mm6, 15 /* Variance 2 > 32768? */
  1952. psraw mm1, 15 /* FFFF/0000 for true/false */
  1953. psraw mm5, 15 /* FFFF/0000 for true/false */
  1954. movq mm0, [edi+64] /* mm0 = Pixel 4 */
  1955. pandn mm2, mm1 /* Variance1<32768 &&
  1956. Variance1<Flimit */
  1957. pandn mm6, mm5 /* Variance2<32768 &&
  1958. Variance2<Flimit */
  1959. movq mm4, [edi+80] /* mm4 = Pixel 5 */
  1960. pand mm6, mm2 /* mm6 = Variance1 < Flimit */
  1961. /* &&Variance2 < Flimit */
  1962. movq mm2, mm0 /* make copy of Pixel4 */
  1963. psubusw mm0, mm4 /* 4 - 5 */
  1964. psubusw mm4, mm2 /* 5 - 4 */
  1965. por mm0, mm4 /* abs(4 - 5) */
  1966. psubw mm0, QStepMmx /* abs(4-5)<QStepMmx ? */
  1967. psraw mm0, 15 /* FFFF/0000 for True/False */
  1968. pand mm0, mm6
  1969. sub edi, 8 /* offset edi back */
  1970. /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
  1971. /* mm0 and mm7 are now in use */
  1972. /* find the loop filtered values for the pixels on block boundary */
  1973. movq mm1, LoopFLimitMmx /* Get the Flimit values for loop filter */
  1974. movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
  1975. movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
  1976. movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
  1977. movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
  1978. psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
  1979. psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
  1980. movq mm4, mm5 /* make a copy */
  1981. paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
  1982. paddw mm3, FourFours /* mm3 + 4 */
  1983. paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
  1984. paddw mm3, mm5 /* FiltVal before shift */
  1985. psraw mm3, 3 /* FiltVal */
  1986. movq mm2, mm3 /* make a copy */
  1987. psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
  1988. pxor mm2, mm3
  1989. psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
  1990. por mm3, FourOnes /* -1 and 1 for + and - */
  1991. movq mm4, mm1 /* make a copy of Flimit */
  1992. psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
  1993. movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
  1994. psraw mm1, 15 /* FFFF or 0000 */
  1995. pxor mm5, mm1
  1996. psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
  1997. psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
  1998. pmullw mm4, mm3 /* get the sign back */
  1999. movq mm1, [edi+64] /* p[-1] */
  2000. movq mm2, [edi+80] /* p[0] */
  2001. paddw mm1, mm4 /* p[-1] + NewFiltVal */
  2002. psubw mm2, mm4 /* p[0] - NewFiltVal */
  2003. pxor mm6, mm6 /* clear mm6 */
  2004. packuswb mm1, mm1 /* clamping */
  2005. packuswb mm2, mm2 /* clamping */
  2006. punpcklbw mm1, mm6 /* unpack to word */
  2007. movq LoopFilteredValuesUp, mm1 /* save the values */
  2008. punpcklbw mm2, mm6 /* unpack to word */
  2009. movq LoopFilteredValuesDown, mm2 /* save the values */
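  /* A scalar sketch of the boundary loop filter just computed (p[-2]..p[1]
     span the block edge; LoopFLimit as loaded above; ClampU8 stands for the
     packuswb/punpcklbw clamp-to-byte pair):

     int FiltVal = ((p[-2] - p[1]) + 3 * (p[0] - p[-1]) + 4) >> 3;
     int sign    = FiltVal < 0 ? -1 : 1;
     int mag     = FiltVal < 0 ? -FiltVal : FiltVal;
     int bounded = LoopFLimit - (mag > LoopFLimit ? mag - LoopFLimit
                                                  : LoopFLimit - mag);
     if (bounded < 0) bounded = 0;  // the psubusw clamp
     int up   = ClampU8(p[-1] + sign * bounded);  // LoopFilteredValuesUp
     int down = ClampU8(p[ 0] - sign * bounded);  // LoopFilteredValuesDown

     Filter values whose magnitude exceeds LoopFLimit are ramped back toward
     zero (reaching zero at 2*LoopFLimit), which is what the
     Flimit - abs(Flimit - abs(FiltVal)) sequence implements. */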
  2010. /* Let's do the filtering now */
  2011. /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
  2012. /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
  2013. movq mm5, [edi] /* mm5 = -5 */
  2014. movq mm4, [edi + 16] /* mm4 = -4 */
  2015. movq mm3, mm4 /* copy of -4 */
  2016. movq mm6, mm5 /* copy of -5 */
  2017. psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
  2018. psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
  2019. por mm4, mm5 /* abs([-4]-[-5] ) */
  2020. psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
  2021. psraw mm4, 15 /* FFFF/0000 for True/False */
  2022. movq mm1, mm4 /* copy of the mm4 */
  2023. pand mm4, mm6 /* */
  2024. pandn mm1, mm3 /* */
  2025. por mm1, mm4 /* mm1 = p1 */
  2026. /* now find P2 */
  2027. movq mm4, [edi+128] /* mm4 = [3] */
  2028. movq mm5, [edi+144] /* mm5 = [4] */
  2029. movq mm3, mm4 /* copy of 3 */
  2030. movq mm6, mm5 /* copy of 4 */
  2031. psubusw mm4, mm6 /* mm4 = [3] - [4] */
  2032. psubusw mm5, mm3 /* mm5 = [4] - [3] */
  2033. por mm4, mm5 /* abs([3]-[4] ) */
  2034. psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
  2035. psraw mm4, 15 /* FFFF/0000 for True/False */
  2036. movq mm2, mm4 /* copy of the mm4 */
  2037. pand mm4, mm6 /* */
  2038. pandn mm2, mm3 /* */
  2039. por mm2, mm4 /* mm2 = p2 */
  2040. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  2041. /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
  2042. /* Des[-w4] = Src[-w4]; */
  2043. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
  2044. movq mm3, mm1 /* mm3 = p1 */
  2045. paddw mm3, mm3 /* mm3 = p1 + p1 */
  2046. paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
  2047. movq mm4, [edi+16] /* mm4 = x1 */
  2048. paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
  2049. paddw mm4, [edi+48] /* mm4 = x1+x3 */
  2050. paddw mm3, [edi+64] /* mm3 += x4 */
  2051. paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
  2052. paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
  2053. movq mm4, mm3 /* mm4 = mm3 */
  2054. movq mm5, [edi+16] /* mm5 = x1 */
  2055. paddw mm4, mm5 /* mm4 = sum+x1 */
  2056. psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
  2057. psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
  2058. paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
  2059. psraw mm4, 4 /* mm4 >>=4 */
  2060. psubw mm4, mm5 /* New Value - old Value */
  2061. pand mm4, mm7 /* And the flag */
  2062. paddw mm4, mm5 /* add the old value back */
  2063. movq [esi], mm4 /* Write new x1 */
  2064. /* sum += x5 -p1 */
  2065. /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
  2066. movq mm5, [edi+32] /* mm5= x2 */
  2067. psubw mm3, mm1 /* sum=sum-p1 */
  2068. paddw mm3, [edi+80] /* sum=sum+x5 */
  2069. movq mm4, mm5 /* copy x2 */
  2070. paddw mm4, mm3 /* mm4=sum+x2 */
  2071. paddw mm4, mm4 /* mm4 <<= 1 */
  2072. psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
  2073. paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
  2074. psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
  2075. psubw mm4, mm5 /* new value - old value */
  2076. pand mm4, mm7 /* And the flag */
  2077. paddw mm4, mm5 /* add the old value back */
  2078. movq [esi+16], mm4 /* write new x2 */
  2079. /* sum += x6 - p1 */
  2080. /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
  2081. movq mm5, [edi+48] /* mm5= x3 */
  2082. psubw mm3, mm1 /* sum=sum-p1 */
  2083. paddw mm3, [edi+96] /* sum=sum+x6 */
  2084. movq mm4, mm5 /* copy x3 */
  2085. paddw mm4, mm3 /* mm4=sum+x3 */
  2086. paddw mm4, mm4 /* mm4 <<= 1 */
  2087. psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
  2088. paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
  2089. psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
  2090. psubw mm4, mm5 /* new value - old value */
  2091. pand mm4, mm7 /* And the flag */
  2092. paddw mm4, mm5 /* add the old value back */
  2093. movq [esi+32], mm4 /* write new x3 */
  2094. /* sum += x7 - p1 */
  2095. /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
  2096. movq mm5, [edi+64] /* mm5 = x4 */
  2097. psubw mm3, mm1 /* sum = sum-p1 */
  2098. paddw mm3, [edi+112] /* sum = sum+x7 */
  2099. movq mm4, mm5 /* mm4 = x4 */
  2100. paddw mm4, mm3 /* mm4 = sum + x4 */
  2101. paddw mm4, mm4 /* mm4 *=2 */
  2102. paddw mm4, mm1 /* += p1 */
  2103. psubw mm4, [edi+16] /* -= x1 */
  2104. psubw mm4, [edi+112] /* -= x7 */
  2105. paddw mm4, [edi+128] /* += x8 */
  2106. movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
  2107. psraw mm4, 4 /* >>=4 */
  2108. psubw mm4, mm5 /* -=x4 */
  2109. pand mm4, mm7 /* and flag */
  2110. paddw mm4, mm5 /* += x4 */
  2111. movq [esi+48], mm4 /* write new x4 */
  2112. /* sum+= x8-x1 */
  2113. /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
  2114. movq mm5, [edi+80] /* mm5 = x5 */
  2115. psubw mm3, [edi+16] /* sum -= x1 */
  2116. paddw mm3, [edi+128] /* sub += x8 */
  2117. movq mm4, mm5 /* mm4 = x5 */
  2118. paddw mm4, mm3 /* mm4= sum+x5 */
  2119. paddw mm4, mm4 /* mm4 *= 2 */
  2120. paddw mm4, [edi+16] /* += x1 */
  2121. psubw mm4, [edi+32] /* -= x2 */
  2122. psubw mm4, [edi+128] /* -= x8 */
  2123. paddw mm4, mm2 /* += p2 */
  2124. movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x4 */
  2125. psraw mm4, 4 /* >>=4 */
  2126. psubw mm4, mm5 /* -=x5 */
  2127. pand mm4, mm7 /* and flag */
  2128. paddw mm4, mm5 /* += x5 */
  2129. movq [esi+64], mm4 /* write new x5 */
  2130. /* sum += p2 - x2 */
  2131. /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
  2132. movq mm5, [edi+96] /* mm5 = x6 */
  2133. psubw mm3, [edi+32] /* -= x2 */
  2134. paddw mm3, mm2 /* += p2 */
  2135. movq mm4, mm5 /* mm4 = x6 */
  2136. paddw mm4, mm3 /* mm4 = sum+x6 */
  2137. paddw mm4, mm4 /* mm4 *= 2*/
  2138. paddw mm4, [edi+32] /* +=x2 */
  2139. psubw mm4, [edi+48] /* -=x3 */
  2140. psraw mm4, 4 /* >>=4 */
  2141. psubw mm4, mm5 /* -=x6 */
  2142. pand mm4, mm7 /* and flag */
  2143. paddw mm4, mm5 /* += x6 */
  2144. movq [esi+80], mm4 /* write new x6 */
  2145. /* sum += p2 - x3 */
  2146. /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
  2147. movq mm5, [edi+112] /* mm5 = x7 */
  2148. psubw mm3, [edi+48] /* -= x3 */
  2149. paddw mm3, mm2 /* += p2 */
  2150. movq mm4, mm5 /* mm4 = x7 */
  2151. paddw mm4, mm3 /* mm4 = sum+x7 */
  2152. paddw mm4, mm4 /* mm4 *= 2*/
  2153. paddw mm4, [edi+48] /* +=x3 */
  2154. psubw mm4, [edi+64] /* -=x4 */
  2155. psraw mm4, 4 /* >>=4 */
  2156. psubw mm4, mm5 /* -=x7 */
  2157. pand mm4, mm7 /* and flag */
  2158. paddw mm4, mm5 /* += x7 */
  2159. movq [esi+96], mm4 /* write new x7 */
  2160. /* sum += p2 - x4 */
  2161. /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
  2162. movq mm5, [edi+128] /* mm5 = x8 */
  2163. psubw mm3, [edi+64] /* -= x4 */
  2164. paddw mm3, mm2 /* += p2 */
  2165. movq mm4, mm5 /* mm4 = x8 */
  2166. paddw mm4, mm3 /* mm4 = sum+x8 */
  2167. paddw mm4, mm4 /* mm4 *= 2*/
  2168. paddw mm4, [edi+64] /* +=x4 */
  2169. psubw mm4, [edi+80] /* -=x5 */
  2170. psraw mm4, 4 /* >>=4 */
  2171. psubw mm4, mm5 /* -=x8 */
  2172. pand mm4, mm7 /* and flag */
  2173. paddw mm4, mm5 /* += x8 */
  2174. movq [esi+112], mm4 /* write new x8 */
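/*
 * Note how the eight taps above share one running sum: each stage
 * slides the 9-sample window by one (sum += entering - leaving, e.g.
 * sum += x5 - p1), so every output costs two adds instead of a full
 * re-summation. Scalar sketch of the first two taps:
 *
 *   sum  = 3*p1 + x1 + x2 + x3 + x4 + 4;
 *   new1 = (((sum + x1) << 1) - x4 + x5) >> 4;
 *   sum += x5 - p1;                            // slide the window
 *   new2 = (((sum + x2) << 1) - x5 + x6) >> 4;
 */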
/* done with the left four columns */
/* now do the right four columns */
add edi, 8 /* shift to the right four columns */
add esi, 8 /* shift to the right four columns */
/* mm0 = Variance 1 < FLimit && Variance 2 < FLimit && abs(4-5) < QStep */
/* mm0 is now in use */
/* find the loop filtered values for the pixels on block boundary */
movq mm1, LoopFLimitMmx /* Get the FLimit values for loop filter */
movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
movq mm4, mm5 /* make a copy */
paddw mm3, FourFours /* mm3 + 4 */
paddw mm4, mm4 /* 2 * ( p[0] - p[-1] ) */
paddw mm3, mm4 /* mm3 += 2 * ( p[0] - p[-1] ) */
paddw mm3, mm5 /* FiltVal before shift */
psraw mm3, 3 /* FiltVal */
movq mm2, mm3 /* make a copy */
psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
pxor mm2, mm3 /* flip bits where negative */
psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
por mm3, FourOnes /* -1 and 1 for - and + */
movq mm4, mm1 /* make a copy of FLimit */
psubw mm1, mm2 /* mm1 = FLimit - abs(FiltVal) */
movq mm5, mm1 /* copy FLimit - abs(FiltVal) */
psraw mm1, 15 /* FFFF or 0000 */
pxor mm5, mm1 /* flip bits where negative */
psubsw mm5, mm1 /* abs(FLimit - abs(FiltVal)) */
psubusw mm4, mm5 /* FLimit - abs(FLimit - abs(FiltVal)) */
pmullw mm4, mm3 /* get the sign back */
movq mm1, [edi+64] /* p[-1] */
movq mm2, [edi+80] /* p[0] */
paddw mm1, mm4 /* p[-1] + NewFiltVal */
psubw mm2, mm4 /* p[0] - NewFiltVal */
pxor mm6, mm6 /* clear mm6 */
packuswb mm1, mm1 /* clamping */
packuswb mm2, mm2 /* clamping */
punpcklbw mm1, mm6 /* unpack to word */
movq LoopFilteredValuesUp, mm1 /* save the values */
punpcklbw mm2, mm6 /* unpack to word */
movq LoopFilteredValuesDown, mm2 /* save the values */
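/*
 * The boundary loop filter above computes, per 16-bit lane,
 *
 *   FiltVal    = ((p[-2] - p[1]) + 3 * (p[0] - p[-1]) + 4) >> 3;
 *   NewFiltVal = sign(FiltVal) * max(0, LoopFLimit - abs(LoopFLimit - abs(FiltVal)));
 *
 * a "tent" limiter that ramps the correction back to zero as
 * abs(FiltVal) approaches 2*LoopFLimit. NewFiltVal is added to p[-1],
 * subtracted from p[0], and both results are clamped to [0,255] by
 * the packuswb round trip before being saved for the x4/x5 taps below.
 */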
/* Let's do the filtering now */
/* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
/* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
movq mm5, [edi] /* mm5 = -5 */
movq mm4, [edi + 16] /* mm4 = -4 */
movq mm3, mm4 /* copy of -4 */
movq mm6, mm5 /* copy of -5 */
psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
por mm4, mm5 /* abs([-4]-[-5]) */
psubw mm4, QStepMmx /* abs([-4]-[-5]) < QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm1, mm4 /* copy of the mm4 */
pand mm4, mm6 /* keep [-5] where true */
pandn mm1, mm3 /* keep [-4] where false */
por mm1, mm4 /* mm1 = p1 */
/* now find p2 */
movq mm4, [edi+128] /* mm4 = [3] */
movq mm5, [edi+144] /* mm5 = [4] */
movq mm3, mm4 /* copy of 3 */
movq mm6, mm5 /* copy of 4 */
psubusw mm4, mm6 /* mm4 = [3] - [4] */
psubusw mm5, mm3 /* mm5 = [4] - [3] */
por mm4, mm5 /* abs([3]-[4]) */
psubw mm4, QStepMmx /* abs([3]-[4]) < QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm2, mm4 /* copy of the mm4 */
pand mm4, mm6 /* keep [4] where true */
pandn mm2, mm3 /* keep [3] where false */
por mm2, mm4 /* mm2 = p2 */
/* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
/* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
/* Des[-w4] = Src[-w4]; */
/* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
movq mm3, mm1 /* mm3 = p1 */
paddw mm3, mm3 /* mm3 = p1 + p1 */
paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
movq mm4, [edi+16] /* mm4 = x1 */
paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
paddw mm4, [edi+48] /* mm4 = x1+x3 */
paddw mm3, [edi+64] /* mm3 += x4 */
paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
movq mm4, mm3 /* mm4 = mm3 */
movq mm5, [edi+16] /* mm5 = x1 */
paddw mm4, mm5 /* mm4 = sum+x1 */
psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
psraw mm4, 4 /* mm4 >>= 4 */
psubw mm4, mm5 /* New Value - old Value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi], mm4 /* Write new x1 */
/* sum += x5 - p1 */
/* Des[-w3] = ((sum+x2)<<1 - x5 + x6) >> 4 */
movq mm5, [edi+32] /* mm5 = x2 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+80] /* sum = sum + x5 */
movq mm4, mm5 /* copy x2 */
paddw mm4, mm3 /* mm4 = sum+x2 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+80] /* mm4 = (sum+x2)<<1-x5 */
paddw mm4, [edi+96] /* mm4 = (sum+x2)<<1-x5+x6 */
psraw mm4, 4 /* mm4 = ((sum+x2)<<1-x5+x6)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+16], mm4 /* write new x2 */
/* sum += x6 - p1 */
/* Des[-w2] = ((sum+x3)<<1 - x6 + x7) >> 4 */
movq mm5, [edi+48] /* mm5 = x3 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+96] /* sum = sum + x6 */
movq mm4, mm5 /* copy x3 */
paddw mm4, mm3 /* mm4 = sum+x3 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+96] /* mm4 = (sum+x3)<<1-x6 */
paddw mm4, [edi+112] /* mm4 = (sum+x3)<<1-x6+x7 */
psraw mm4, 4 /* mm4 = ((sum+x3)<<1-x6+x7)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+32], mm4 /* write new x3 */
/* sum += x7 - p1 */
/* Des[-w1] = ((sum+x4)<<1 + p1 - x1 - x7 + x8) >> 4 */
movq mm5, [edi+64] /* mm5 = x4 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+112] /* sum = sum + x7 */
movq mm4, mm5 /* mm4 = x4 */
paddw mm4, mm3 /* mm4 = sum + x4 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, mm1 /* += p1 */
psubw mm4, [edi+16] /* -= x1 */
psubw mm4, [edi+112] /* -= x7 */
paddw mm4, [edi+128] /* += x8 */
movq mm5, LoopFilteredValuesUp /* Read the loop filtered value of x4 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x4 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x4 */
movq [esi+48], mm4 /* write new x4 */
/* sum += x8 - x1 */
/* Des[0] = ((sum+x5)<<1 + x1 - x2 - x8 + p2) >> 4 */
movq mm5, [edi+80] /* mm5 = x5 */
psubw mm3, [edi+16] /* sum -= x1 */
paddw mm3, [edi+128] /* sum += x8 */
movq mm4, mm5 /* mm4 = x5 */
paddw mm4, mm3 /* mm4 = sum+x5 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+16] /* += x1 */
psubw mm4, [edi+32] /* -= x2 */
psubw mm4, [edi+128] /* -= x8 */
paddw mm4, mm2 /* += p2 */
movq mm5, LoopFilteredValuesDown /* Read the loop filtered value of x5 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x5 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x5 */
movq [esi+64], mm4 /* write new x5 */
/* sum += p2 - x2 */
/* Des[w1] = ((sum+x6)<<1 + x2 - x3) >> 4 */
movq mm5, [edi+96] /* mm5 = x6 */
psubw mm3, [edi+32] /* -= x2 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x6 */
paddw mm4, mm3 /* mm4 = sum+x6 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+32] /* += x2 */
psubw mm4, [edi+48] /* -= x3 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x6 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x6 */
movq [esi+80], mm4 /* write new x6 */
/* sum += p2 - x3 */
/* Des[w2] = ((sum+x7)<<1 + x3 - x4) >> 4 */
movq mm5, [edi+112] /* mm5 = x7 */
psubw mm3, [edi+48] /* -= x3 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x7 */
paddw mm4, mm3 /* mm4 = sum+x7 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+48] /* += x3 */
psubw mm4, [edi+64] /* -= x4 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x7 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x7 */
movq [esi+96], mm4 /* write new x7 */
/* sum += p2 - x4 */
/* Des[w3] = ((sum+x8)<<1 + x4 - x5) >> 4 */
movq mm5, [edi+128] /* mm5 = x8 */
psubw mm3, [edi+64] /* -= x4 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x8 */
paddw mm4, mm3 /* mm4 = sum+x8 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+64] /* += x4 */
psubw mm4, [edi+80] /* -= x5 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x8 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x8 */
movq [esi+112], mm4 /* write new x8 */
/* done with the right four columns */
add edi, 8 /* shift edi to point to x1 */
sub esi, 8 /* shift esi back to x1 */
mov ebp, Des /* the destination */
lea ebp, [ebp + edx*4] /* point to des[-w4] */
movq mm0, [esi]
packuswb mm0, [esi + 8]
movq [ebp], mm0 /* write des[-w4] */
movq mm1, [esi + 16]
packuswb mm1, [esi + 24]
movq [ebp+ecx], mm1 /* write des[-w3] */
movq mm2, [esi + 32]
packuswb mm2, [esi + 40]
movq [ebp+ecx*2], mm2 /* write des[-w2] */
movq mm3, [esi + 48]
packuswb mm3, [esi + 56]
lea ebp, [ebp+ecx*4] /* point to des[0] */
movq [ebp+edx], mm3 /* write des[-w1] */
movq mm0, [esi + 64]
packuswb mm0, [esi + 72]
movq [ebp], mm0 /* write des[0] */
movq mm1, [esi + 80]
packuswb mm1, [esi + 88]
movq [ebp+ecx], mm1 /* write des[w1] */
movq mm2, [esi + 96]
packuswb mm2, [esi + 104]
movq [ebp+ecx*2], mm2 /* write des[w2] */
movq mm3, [esi + 112]
packuswb mm3, [esi + 120]
lea ebp, [ebp+ecx*2] /* point to des[w2] */
movq [ebp+ecx], mm3 /* write des[w3] */
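/*
 * Write-back addressing: ecx holds the pitch and edx holds -pitch, so
 * the lea ebp,[ebp+edx*4] above positions ebp at Des[-4*Pitch]
 * (des[-w4]); the rows are then stored at successive pitch multiples
 * while packuswb re-packs the 16-bit working rows into clamped bytes.
 */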
pop edi
pop esi
pop edx
pop ecx
pop ebp
pop eax
} /* end of the __asm block */
Var1 = Variance11[0] + Variance11[1] + Variance11[2] + Variance11[3];
Var1 += Variance12[0] + Variance12[1] + Variance12[2] + Variance12[3];
pbi->FragmentVariances[CurrentFrag] += Var1;
Var2 = Variance21[0] + Variance21[1] + Variance21[2] + Variance21[3];
Var2 += Variance22[0] + Variance22[1] + Variance22[2] + Variance22[3];
pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
if(CurrentFrag==StartFrag)
CurrentFrag++;
else
{
Des = DesPtr - 8 * PlaneLineStep + 8 * (CurrentFrag - StartFrag);
Src = Des;
QStep = QuantScale[pbi->FragQIndex[CurrentFrag]];
for( j = 0; j < 8; j++ )
{
Rows[j] = (short)(Src[-5 + j * PlaneLineStep]);
Rows[72+j] = (short)(Src[ 4 + j * PlaneLineStep]);
}
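/*
 * Rows[] is a transposed working buffer of shorts: each of the ten
 * columns Src[-5]..Src[+4] occupies eight 16-bit entries (16 bytes).
 * The C loop above stages only the two outermost columns (Rows[0..7]
 * = column -5, Rows[72..79] = column +4); the MMX code below
 * transposes the eight columns in between into Rows[8..71].
 */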
__asm
{
/* Save the registers */
push eax
push ebp
/* Calculate the FLimit and store FLimit and QStep */
mov eax, QStep /* get QStep */
movd mm0, eax /* mm0 = 0, 0, 0, Q */
push ecx
punpcklwd mm0, mm0 /* mm0 = 0, 0, Q, Q */
movq mm1, FourThrees /* mm1 = 03 03 03 03 */
push edx
punpckldq mm0, mm0 /* mm0 = Q, Q, Q, Q */
movq QStepMmx, mm0 /* write the Q step */
push esi
pmullw mm1, mm0 /* mm1 = QStep * 3 */
pmullw mm1, mm0 /* mm1 = QStep * QStep * 3 */
push edi
psrlw mm1, 5 /* mm1 = FLimit */
movq [FLimitMmx], mm1 /* Save FLimit */
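/*
 * So the variance threshold used below is
 *
 *   FLimit = (3 * QStep * QStep) >> 5;
 *
 * i.e. it scales with the square of the quantiser step.
 */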
/* setup the pointers to data */
mov eax, Src /* eax = Src */
xor edx, edx /* clear edx */
sub eax, 4 /* eax = Src-4 */
lea esi, NewRows /* esi = NewRows */
lea edi, Rows /* edi = Rows */
mov ecx, PlaneLineStep /* ecx = Pitch */
sub edx, ecx /* edx = -Pitch */
/* Get the data to the intermediate buffer */
movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
lea eax, [eax+ecx*4] /* Go down four rows */
movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
pxor mm7, mm7 /* clear mm7 */
movq mm5, mm0 /* make a copy */
punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
movq [edi+16], mm0 /* write 00 10 20 30 */
punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
movq mm0, mm1 /* mm0 = 33 23 13 03 32 22 12 02 */
movq [edi+32], mm5 /* write 01 11 21 31 */
punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
punpckhbw mm0, mm7 /* mm0 = 33 23 13 03 */
movq [edi+48], mm1 /* write 02 12 22 32 */
movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
movq [edi+64], mm0 /* write 03 13 23 33 */
punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
movq [edi+80], mm2 /* write 04 14 24 34 */
punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
movq [edi+96], mm3 /* write 05 15 25 35 */
movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
movq mm1, [eax + ecx] /* mm1 = 57 56 55 54 53 52 51 50 */
movq [edi+112], mm4 /* write 06 16 26 36 */
movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
lea eax, [eax+ecx*4] /* Go down four rows */
movq [edi+128], mm5 /* write 07 17 27 37 */
movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
punpckhbw mm4, mm1 /* mm4 = 57 47 56 46 55 45 54 44 */
movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
movq mm2, mm4 /* mm2 = 57 47 56 46 55 45 54 44 */
punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
movq mm5, mm0 /* make a copy */
punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
movq [edi+24], mm0 /* write 40 50 60 70 */
punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
movq [edi+40], mm5 /* write 41 51 61 71 */
punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
movq [edi+56], mm1 /* write 42 52 62 72 */
movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
movq [edi+72], mm0 /* write 43 53 63 73 */
punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
movq [edi+88], mm2 /* write 44 54 64 74 */
punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
movq [edi+104], mm3 /* write 45 55 65 75 */
movq [edi+120], mm4 /* write 46 56 66 76 */
movq [edi+136], mm5 /* write 47 57 67 77 */
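/*
 * The cascade above is a conventional MMX 8x8 byte transpose:
 * punpcklbw/punpckhbw interleave row pairs, punpcklwd/punpckhwd
 * interleave the pairs of pairs, and the final unpack against zero
 * (mm7) both finishes the reordering and widens each pixel to a
 * 16-bit lane of Rows[].
 */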
/* Now, compute the variances for Pixel 1-4 and 5-8 */
/* we use mm0,mm1,mm2 for 1234 and mm4, mm5, mm6 for 5-8 */
/* mm7 = 0, mm3 = {128, 128, 128, 128} */
pcmpeqw mm3, mm3 /* mm3 = FFFFFFFFFFFFFFFF */
psllw mm3, 15 /* mm3 = 8000800080008000 */
psrlw mm3, 8 /* mm3 = 0080008000800080 */
movq mm2, [edi+16] /* Pixel 1 */
movq mm6, [edi+80] /* Pixel 5 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
movq mm0, mm2 /* mm0 = pixel 1 */
movq mm4, mm6 /* mm4 = pixel 5 */
pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
movq mm1, mm2 /* mm1 = pixel1^2 */
movq mm5, mm6 /* mm5 = pixel5^2 */
movq mm2, [edi+32] /* Pixel 2 */
movq mm6, [edi+96] /* Pixel 6 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 2 */
paddw mm4, mm6 /* mm4 += pixel 6 */
pmullw mm2, mm2 /* mm2 = pixel2^2 */
pmullw mm6, mm6 /* mm6 = pixel6^2 */
paddw mm1, mm2 /* mm1 += pixel2^2 */
paddw mm5, mm6 /* mm5 += pixel6^2 */
movq mm2, [edi+48] /* Pixel 3 */
movq mm6, [edi+112] /* Pixel 7 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 3 */
paddw mm4, mm6 /* mm4 += pixel 7 */
pmullw mm2, mm2 /* mm2 = pixel3^2 */
pmullw mm6, mm6 /* mm6 = pixel7^2 */
paddw mm1, mm2 /* mm1 += pixel3^2 */
paddw mm5, mm6 /* mm5 += pixel7^2 */
movq mm2, [edi+64] /* Pixel 4 */
movq mm6, [edi+128] /* Pixel 8 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 4 */
paddw mm4, mm6 /* mm4 += pixel 8 */
pmullw mm2, mm2 /* mm2 = pixel4^2 */
pmullw mm6, mm6 /* mm6 = pixel8^2 */
paddw mm1, mm2 /* mm1 += pixel4^2 */
paddw mm5, mm6 /* mm5 += pixel8^2 */
/* mm0 = x1 + x2 + x3 + x4 */
/* mm1 = x1^2 + x2^2 + x3^2 + x4^2 */
/* mm4 = x5 + x6 + x7 + x8 */
/* mm5 = x5^2 + x6^2 + x7^2 + x8^2 */
movq mm7, mm3 /* mm7 = mm3 */
psrlw mm7, 7 /* mm7 = 0001000100010001 */
movq mm2, mm0 /* make copy of sum1 */
movq mm6, mm4 /* make copy of sum2 */
paddw mm0, mm7 /* (sum1 + 1) */
paddw mm4, mm7 /* (sum2 + 1) */
psraw mm2, 1 /* sum1 / 2 */
psraw mm6, 1 /* sum2 / 2 */
psraw mm0, 1 /* (sum1 + 1) / 2 */
psraw mm4, 1 /* (sum2 + 1) / 2 */
pmullw mm2, mm0 /* (sum1)/2 * (sum1+1)/2 */
pmullw mm6, mm4 /* (sum2)/2 * (sum2+1)/2 */
psubw mm1, mm2 /* Variance 1 */
psubw mm5, mm6 /* Variance 2 */
movq [Variance11], mm1 /* Save Variance1 */
movq [Variance21], mm5 /* Save Variance2 */
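/*
 * Per four-pixel group this evaluates
 *
 *   Variance = sum(x*x) - (sum(x)/2) * ((sum(x)+1)/2)
 *
 * i.e. sum of squares minus (approximately) sum^2/4, all in 16-bit
 * lanes. The -128 bias on the samples keeps the per-lane sums small,
 * and the sign tests below (the psraw-15 "Variance > 32768" checks)
 * reject lanes where the 16-bit arithmetic has gone out of range.
 */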
movq mm7, FLimitMmx /* mm7 = FLimit */
movq mm2, mm1 /* copy of Variance 1 */
movq mm6, mm5 /* copy of Variance 2 */
psubw mm1, mm7 /* Variance 1 < FLimit? */
psubw mm5, mm7 /* Variance 2 < FLimit? */
psraw mm1, 15 /* FFFF/0000 for true/false */
psraw mm5, 15 /* FFFF/0000 for true/false */
psraw mm2, 15 /* Variance 1 > 32768 ? */
psraw mm6, 15 /* Variance 2 > 32768 ? */
movq mm7, [edi+64] /* mm7 = Pixel 4 */
pandn mm2, mm1 /* Variance 1 < FLimit && Variance 1 < 32768 */
pandn mm6, mm5 /* Variance 2 < FLimit && Variance 2 < 32768 */
movq mm4, [edi+80] /* mm4 = Pixel 5 */
pand mm6, mm2 /* mm6 = Variance1 < FLimit && Variance2 < FLimit */
movq mm2, mm7 /* make copy of Pixel4 */
psubusw mm7, mm4 /* 4 - 5 */
psubusw mm4, mm2 /* 5 - 4 */
por mm7, mm4 /* abs(4 - 5) */
psubw mm7, QStepMmx /* abs(4-5) < QStep ? */
psraw mm7, 15 /* FFFF/0000 for True/False */
pand mm7, mm6
/* mm7 = Variance 1 < FLimit && Variance 2 < FLimit && abs(4-5) < QStep */
/* now let's look at the right four columns */
add edi, 8 /* offset 8 to the right 4 cols */
movq mm2, [edi+16] /* Pixel 1 */
movq mm6, [edi+80] /* Pixel 5 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
movq mm0, mm2 /* mm0 = pixel 1 */
movq mm4, mm6 /* mm4 = pixel 5 */
pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
movq mm1, mm2 /* mm1 = pixel1^2 */
movq mm5, mm6 /* mm5 = pixel5^2 */
movq mm2, [edi+32] /* Pixel 2 */
movq mm6, [edi+96] /* Pixel 6 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 2 */
paddw mm4, mm6 /* mm4 += pixel 6 */
pmullw mm2, mm2 /* mm2 = pixel2^2 */
pmullw mm6, mm6 /* mm6 = pixel6^2 */
paddw mm1, mm2 /* mm1 += pixel2^2 */
paddw mm5, mm6 /* mm5 += pixel6^2 */
movq mm2, [edi+48] /* Pixel 3 */
movq mm6, [edi+112] /* Pixel 7 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 3 */
paddw mm4, mm6 /* mm4 += pixel 7 */
pmullw mm2, mm2 /* mm2 = pixel3^2 */
pmullw mm6, mm6 /* mm6 = pixel7^2 */
paddw mm1, mm2 /* mm1 += pixel3^2 */
paddw mm5, mm6 /* mm5 += pixel7^2 */
movq mm2, [edi+64] /* Pixel 4 */
movq mm6, [edi+128] /* Pixel 8 */
psubw mm2, mm3 /* mm2 -= 128 */
psubw mm6, mm3 /* mm6 -= 128 */
paddw mm0, mm2 /* mm0 += pixel 4 */
paddw mm4, mm6 /* mm4 += pixel 8 */
pmullw mm2, mm2 /* mm2 = pixel4^2 */
pmullw mm6, mm6 /* mm6 = pixel8^2 */
paddw mm1, mm2 /* mm1 += pixel4^2 */
paddw mm5, mm6 /* mm5 += pixel8^2 */
/* mm0 = x1 + x2 + x3 + x4 */
/* mm1 = x1^2 + x2^2 + x3^2 + x4^2 */
/* mm4 = x5 + x6 + x7 + x8 */
/* mm5 = x5^2 + x6^2 + x7^2 + x8^2 */
psrlw mm3, 7 /* mm3 = 0001000100010001 */
movq mm2, mm0 /* make copy of sum1 */
movq mm6, mm4 /* make copy of sum2 */
paddw mm0, mm3 /* (sum1 + 1) */
paddw mm4, mm3 /* (sum2 + 1) */
psraw mm2, 1 /* sum1 / 2 */
psraw mm6, 1 /* sum2 / 2 */
psraw mm0, 1 /* (sum1 + 1) / 2 */
psraw mm4, 1 /* (sum2 + 1) / 2 */
pmullw mm2, mm0 /* (sum1)/2 * (sum1+1)/2 */
pmullw mm6, mm4 /* (sum2)/2 * (sum2+1)/2 */
psubw mm1, mm2 /* Variance 1 */
psubw mm5, mm6 /* Variance 2 */
movq [Variance12], mm1 /* Save Variance1 */
movq [Variance22], mm5 /* Save Variance2 */
movq mm3, FLimitMmx /* mm3 = FLimit */
movq mm2, mm1 /* copy of Variance 1 */
movq mm6, mm5 /* copy of Variance 2 */
psubw mm1, mm3 /* Variance 1 < FLimit? */
psubw mm5, mm3 /* Variance 2 < FLimit? */
psraw mm6, 15 /* Variance 2 > 32768 ? */
psraw mm2, 15 /* Variance 1 > 32768 ? */
psraw mm1, 15 /* FFFF/0000 for true/false */
psraw mm5, 15 /* FFFF/0000 for true/false */
movq mm0, [edi+64] /* mm0 = Pixel 4 */
pandn mm2, mm1 /* Variance 1 < 32768 && Variance 1 < FLimit */
pandn mm6, mm5 /* Variance 2 < 32768 && Variance 2 < FLimit */
movq mm4, [edi+80] /* mm4 = Pixel 5 */
pand mm6, mm2 /* mm6 = Variance1 < FLimit && Variance2 < FLimit */
movq mm2, mm0 /* make copy of Pixel4 */
psubusw mm0, mm4 /* 4 - 5 */
psubusw mm4, mm2 /* 5 - 4 */
por mm0, mm4 /* abs(4 - 5) */
psubw mm0, QStepMmx /* abs(4-5) < QStep ? */
psraw mm0, 15 /* FFFF/0000 for True/False */
pand mm0, mm6
sub edi, 8 /* offset edi back */
/* mm0 = Variance 1 < FLimit && Variance 2 < FLimit && abs(4-5) < QStep */
/* mm0 and mm7 are now in use */
/* find the loop filtered values for the pixels on block boundary */
movq mm1, LoopFLimitMmx /* Get the FLimit values for loop filter */
movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
movq mm4, mm5 /* make a copy */
paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
paddw mm3, FourFours /* mm3 + 4 */
paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
paddw mm3, mm5 /* FiltVal before shift */
psraw mm3, 3 /* FiltVal */
movq mm2, mm3 /* make a copy */
psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
pxor mm2, mm3 /* flip bits where negative */
psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
por mm3, FourOnes /* -1 and 1 for - and + */
movq mm4, mm1 /* make a copy of FLimit */
psubw mm1, mm2 /* mm1 = FLimit - abs(FiltVal) */
movq mm5, mm1 /* copy FLimit - abs(FiltVal) */
psraw mm1, 15 /* FFFF or 0000 */
pxor mm5, mm1 /* flip bits where negative */
psubsw mm5, mm1 /* abs(FLimit - abs(FiltVal)) */
psubusw mm4, mm5 /* FLimit - abs(FLimit - abs(FiltVal)) */
pmullw mm4, mm3 /* get the sign back */
movq mm1, [edi+64] /* p[-1] */
movq mm2, [edi+80] /* p[0] */
paddw mm1, mm4 /* p[-1] + NewFiltVal */
psubw mm2, mm4 /* p[0] - NewFiltVal */
pxor mm6, mm6 /* clear mm6 */
packuswb mm1, mm1 /* clamping */
packuswb mm2, mm2 /* clamping */
punpcklbw mm1, mm6 /* unpack to word */
movq LoopFilteredValuesUp, mm1 /* save the values */
punpcklbw mm2, mm6 /* unpack to word */
movq LoopFilteredValuesDown, mm2 /* save the values */
/* Let's do the filtering now */
/* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
/* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
movq mm5, [edi] /* mm5 = -5 */
movq mm4, [edi + 16] /* mm4 = -4 */
movq mm3, mm4 /* copy of -4 */
movq mm6, mm5 /* copy of -5 */
psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
por mm4, mm5 /* abs([-4]-[-5]) */
psubw mm4, QStepMmx /* abs([-4]-[-5]) < QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm1, mm4 /* copy of the mm4 */
pand mm4, mm6 /* keep [-5] where true */
pandn mm1, mm3 /* keep [-4] where false */
por mm1, mm4 /* mm1 = p1 */
/* now find p2 */
movq mm4, [edi+128] /* mm4 = [3] */
movq mm5, [edi+144] /* mm5 = [4] */
movq mm3, mm4 /* copy of 3 */
movq mm6, mm5 /* copy of 4 */
psubusw mm4, mm6 /* mm4 = [3] - [4] */
psubusw mm5, mm3 /* mm5 = [4] - [3] */
por mm4, mm5 /* abs([3]-[4]) */
psubw mm4, QStepMmx /* abs([3]-[4]) < QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm2, mm4 /* copy of the mm4 */
pand mm4, mm6 /* keep [4] where true */
pandn mm2, mm3 /* keep [3] where false */
por mm2, mm4 /* mm2 = p2 */
/* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
/* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
/* Des[-w4] = Src[-w4]; */
/* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
movq mm3, mm1 /* mm3 = p1 */
paddw mm3, mm3 /* mm3 = p1 + p1 */
paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
movq mm4, [edi+16] /* mm4 = x1 */
paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
paddw mm4, [edi+48] /* mm4 = x1+x3 */
paddw mm3, [edi+64] /* mm3 += x4 */
paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
movq mm4, mm3 /* mm4 = mm3 */
movq mm5, [edi+16] /* mm5 = x1 */
paddw mm4, mm5 /* mm4 = sum+x1 */
psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
psraw mm4, 4 /* mm4 >>= 4 */
psubw mm4, mm5 /* New Value - old Value */
pand mm4, mm7 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi], mm4 /* Write new x1 */
/* sum += x5 - p1 */
/* Des[-w3] = ((sum+x2)<<1 - x5 + x6) >> 4 */
movq mm5, [edi+32] /* mm5 = x2 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+80] /* sum = sum + x5 */
movq mm4, mm5 /* copy x2 */
paddw mm4, mm3 /* mm4 = sum+x2 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+80] /* mm4 = (sum+x2)<<1-x5 */
paddw mm4, [edi+96] /* mm4 = (sum+x2)<<1-x5+x6 */
psraw mm4, 4 /* mm4 = ((sum+x2)<<1-x5+x6)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm7 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+16], mm4 /* write new x2 */
/* sum += x6 - p1 */
/* Des[-w2] = ((sum+x3)<<1 - x6 + x7) >> 4 */
movq mm5, [edi+48] /* mm5 = x3 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+96] /* sum = sum + x6 */
movq mm4, mm5 /* copy x3 */
paddw mm4, mm3 /* mm4 = sum+x3 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+96] /* mm4 = (sum+x3)<<1-x6 */
paddw mm4, [edi+112] /* mm4 = (sum+x3)<<1-x6+x7 */
psraw mm4, 4 /* mm4 = ((sum+x3)<<1-x6+x7)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm7 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+32], mm4 /* write new x3 */
/* sum += x7 - p1 */
/* Des[-w1] = ((sum+x4)<<1 + p1 - x1 - x7 + x8) >> 4 */
movq mm5, [edi+64] /* mm5 = x4 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+112] /* sum = sum + x7 */
movq mm4, mm5 /* mm4 = x4 */
paddw mm4, mm3 /* mm4 = sum + x4 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, mm1 /* += p1 */
psubw mm4, [edi+16] /* -= x1 */
psubw mm4, [edi+112] /* -= x7 */
paddw mm4, [edi+128] /* += x8 */
movq mm5, LoopFilteredValuesUp /* Read the loop filtered value of x4 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x4 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x4 */
movq [esi+48], mm4 /* write new x4 */
/* sum += x8 - x1 */
/* Des[0] = ((sum+x5)<<1 + x1 - x2 - x8 + p2) >> 4 */
movq mm5, [edi+80] /* mm5 = x5 */
psubw mm3, [edi+16] /* sum -= x1 */
paddw mm3, [edi+128] /* sum += x8 */
movq mm4, mm5 /* mm4 = x5 */
paddw mm4, mm3 /* mm4 = sum+x5 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+16] /* += x1 */
psubw mm4, [edi+32] /* -= x2 */
psubw mm4, [edi+128] /* -= x8 */
paddw mm4, mm2 /* += p2 */
movq mm5, LoopFilteredValuesDown /* Read the loop filtered value of x5 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x5 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x5 */
movq [esi+64], mm4 /* write new x5 */
/* sum += p2 - x2 */
/* Des[w1] = ((sum+x6)<<1 + x2 - x3) >> 4 */
movq mm5, [edi+96] /* mm5 = x6 */
psubw mm3, [edi+32] /* -= x2 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x6 */
paddw mm4, mm3 /* mm4 = sum+x6 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+32] /* += x2 */
psubw mm4, [edi+48] /* -= x3 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x6 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x6 */
movq [esi+80], mm4 /* write new x6 */
/* sum += p2 - x3 */
/* Des[w2] = ((sum+x7)<<1 + x3 - x4) >> 4 */
movq mm5, [edi+112] /* mm5 = x7 */
psubw mm3, [edi+48] /* -= x3 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x7 */
paddw mm4, mm3 /* mm4 = sum+x7 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+48] /* += x3 */
psubw mm4, [edi+64] /* -= x4 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x7 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x7 */
movq [esi+96], mm4 /* write new x7 */
/* sum += p2 - x4 */
/* Des[w3] = ((sum+x8)<<1 + x4 - x5) >> 4 */
movq mm5, [edi+128] /* mm5 = x8 */
psubw mm3, [edi+64] /* -= x4 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x8 */
paddw mm4, mm3 /* mm4 = sum+x8 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+64] /* += x4 */
psubw mm4, [edi+80] /* -= x5 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x8 */
pand mm4, mm7 /* and flag */
paddw mm4, mm5 /* += x8 */
movq [esi+112], mm4 /* write new x8 */
/* done with the left four columns */
/* now do the right four columns */
add edi, 8 /* shift to the right four columns */
add esi, 8 /* shift to the right four columns */
/* mm0 = Variance 1 < FLimit && Variance 2 < FLimit && abs(4-5) < QStep */
/* mm0 is now in use */
/* find the loop filtered values for the pixels on block boundary */
movq mm1, LoopFLimitMmx /* Get the FLimit values for loop filter */
movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
movq mm4, mm5 /* make a copy */
paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
paddw mm3, FourFours /* mm3 + 4 */
paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
paddw mm3, mm5 /* FiltVal before shift */
psraw mm3, 3 /* FiltVal */
movq mm2, mm3 /* make a copy */
psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
pxor mm2, mm3 /* flip bits where negative */
psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
por mm3, FourOnes /* -1 and 1 for - and + */
movq mm4, mm1 /* make a copy of FLimit */
psubw mm1, mm2 /* mm1 = FLimit - abs(FiltVal) */
movq mm5, mm1 /* copy FLimit - abs(FiltVal) */
psraw mm1, 15 /* FFFF or 0000 */
pxor mm5, mm1 /* flip bits where negative */
psubsw mm5, mm1 /* abs(FLimit - abs(FiltVal)) */
psubusw mm4, mm5 /* FLimit - abs(FLimit - abs(FiltVal)) */
pmullw mm4, mm3 /* get the sign back */
movq mm1, [edi+64] /* p[-1] */
movq mm2, [edi+80] /* p[0] */
paddw mm1, mm4 /* p[-1] + NewFiltVal */
psubw mm2, mm4 /* p[0] - NewFiltVal */
pxor mm6, mm6 /* clear mm6 */
packuswb mm1, mm1 /* clamping */
packuswb mm2, mm2 /* clamping */
punpcklbw mm1, mm6 /* unpack to word */
movq LoopFilteredValuesUp, mm1 /* save the values */
punpcklbw mm2, mm6 /* unpack to word */
movq LoopFilteredValuesDown, mm2 /* save the values */
/* Let's do the filtering now */
/* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
/* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
movq mm5, [edi] /* mm5 = -5 */
movq mm4, [edi + 16] /* mm4 = -4 */
movq mm3, mm4 /* copy of -4 */
movq mm6, mm5 /* copy of -5 */
psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
por mm4, mm5 /* abs([-4]-[-5]) */
psubw mm4, QStepMmx /* abs([-4]-[-5]) < QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm1, mm4 /* copy of the mm4 */
pand mm4, mm6 /* keep [-5] where true */
pandn mm1, mm3 /* keep [-4] where false */
por mm1, mm4 /* mm1 = p1 */
/* now find p2 */
movq mm4, [edi+128] /* mm4 = [3] */
movq mm5, [edi+144] /* mm5 = [4] */
movq mm3, mm4 /* copy of 3 */
movq mm6, mm5 /* copy of 4 */
psubusw mm4, mm6 /* mm4 = [3] - [4] */
psubusw mm5, mm3 /* mm5 = [4] - [3] */
por mm4, mm5 /* abs([3]-[4]) */
psubw mm4, QStepMmx /* abs([3]-[4]) < QStep? */
psraw mm4, 15 /* FFFF/0000 for True/False */
movq mm2, mm4 /* copy of the mm4 */
pand mm4, mm6 /* keep [4] where true */
pandn mm2, mm3 /* keep [3] where false */
por mm2, mm4 /* mm2 = p2 */
/* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
/* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
/* Des[-w4] = Src[-w4]; */
/* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
movq mm3, mm1 /* mm3 = p1 */
paddw mm3, mm3 /* mm3 = p1 + p1 */
paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
movq mm4, [edi+16] /* mm4 = x1 */
paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
paddw mm4, [edi+48] /* mm4 = x1+x3 */
paddw mm3, [edi+64] /* mm3 += x4 */
paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
movq mm4, mm3 /* mm4 = mm3 */
movq mm5, [edi+16] /* mm5 = x1 */
paddw mm4, mm5 /* mm4 = sum+x1 */
psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
psraw mm4, 4 /* mm4 >>= 4 */
psubw mm4, mm5 /* New Value - old Value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi], mm4 /* Write new x1 */
/* sum += x5 - p1 */
/* Des[-w3] = ((sum+x2)<<1 - x5 + x6) >> 4 */
movq mm5, [edi+32] /* mm5 = x2 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+80] /* sum = sum + x5 */
movq mm4, mm5 /* copy x2 */
paddw mm4, mm3 /* mm4 = sum+x2 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+80] /* mm4 = (sum+x2)<<1-x5 */
paddw mm4, [edi+96] /* mm4 = (sum+x2)<<1-x5+x6 */
psraw mm4, 4 /* mm4 = ((sum+x2)<<1-x5+x6)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+16], mm4 /* write new x2 */
/* sum += x6 - p1 */
/* Des[-w2] = ((sum+x3)<<1 - x6 + x7) >> 4 */
movq mm5, [edi+48] /* mm5 = x3 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+96] /* sum = sum + x6 */
movq mm4, mm5 /* copy x3 */
paddw mm4, mm3 /* mm4 = sum+x3 */
paddw mm4, mm4 /* mm4 <<= 1 */
psubw mm4, [edi+96] /* mm4 = (sum+x3)<<1-x6 */
paddw mm4, [edi+112] /* mm4 = (sum+x3)<<1-x6+x7 */
psraw mm4, 4 /* mm4 = ((sum+x3)<<1-x6+x7)>>4 */
psubw mm4, mm5 /* new value - old value */
pand mm4, mm0 /* And the flag */
paddw mm4, mm5 /* add the old value back */
movq [esi+32], mm4 /* write new x3 */
/* sum += x7 - p1 */
/* Des[-w1] = ((sum+x4)<<1 + p1 - x1 - x7 + x8) >> 4 */
movq mm5, [edi+64] /* mm5 = x4 */
psubw mm3, mm1 /* sum = sum - p1 */
paddw mm3, [edi+112] /* sum = sum + x7 */
movq mm4, mm5 /* mm4 = x4 */
paddw mm4, mm3 /* mm4 = sum + x4 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, mm1 /* += p1 */
psubw mm4, [edi+16] /* -= x1 */
psubw mm4, [edi+112] /* -= x7 */
paddw mm4, [edi+128] /* += x8 */
movq mm5, LoopFilteredValuesUp /* Read the loop filtered value of x4 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x4 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x4 */
movq [esi+48], mm4 /* write new x4 */
/* sum += x8 - x1 */
/* Des[0] = ((sum+x5)<<1 + x1 - x2 - x8 + p2) >> 4 */
movq mm5, [edi+80] /* mm5 = x5 */
psubw mm3, [edi+16] /* sum -= x1 */
paddw mm3, [edi+128] /* sum += x8 */
movq mm4, mm5 /* mm4 = x5 */
paddw mm4, mm3 /* mm4 = sum+x5 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+16] /* += x1 */
psubw mm4, [edi+32] /* -= x2 */
psubw mm4, [edi+128] /* -= x8 */
paddw mm4, mm2 /* += p2 */
movq mm5, LoopFilteredValuesDown /* Read the loop filtered value of x5 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x5 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x5 */
movq [esi+64], mm4 /* write new x5 */
/* sum += p2 - x2 */
/* Des[w1] = ((sum+x6)<<1 + x2 - x3) >> 4 */
movq mm5, [edi+96] /* mm5 = x6 */
psubw mm3, [edi+32] /* -= x2 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x6 */
paddw mm4, mm3 /* mm4 = sum+x6 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+32] /* += x2 */
psubw mm4, [edi+48] /* -= x3 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x6 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x6 */
movq [esi+80], mm4 /* write new x6 */
/* sum += p2 - x3 */
/* Des[w2] = ((sum+x7)<<1 + x3 - x4) >> 4 */
movq mm5, [edi+112] /* mm5 = x7 */
psubw mm3, [edi+48] /* -= x3 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x7 */
paddw mm4, mm3 /* mm4 = sum+x7 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+48] /* += x3 */
psubw mm4, [edi+64] /* -= x4 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x7 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x7 */
movq [esi+96], mm4 /* write new x7 */
/* sum += p2 - x4 */
/* Des[w3] = ((sum+x8)<<1 + x4 - x5) >> 4 */
movq mm5, [edi+128] /* mm5 = x8 */
psubw mm3, [edi+64] /* -= x4 */
paddw mm3, mm2 /* += p2 */
movq mm4, mm5 /* mm4 = x8 */
paddw mm4, mm3 /* mm4 = sum+x8 */
paddw mm4, mm4 /* mm4 *= 2 */
paddw mm4, [edi+64] /* += x4 */
psubw mm4, [edi+80] /* -= x5 */
psraw mm4, 4 /* >>= 4 */
psubw mm4, mm5 /* -= x8 */
pand mm4, mm0 /* and flag */
paddw mm4, mm5 /* += x8 */
movq [esi+112], mm4 /* write new x8 */
/* done with the right four columns */
/* transpose */
mov eax, Des /* the destination */
add edi, 8 /* shift edi to point to x1 */
sub esi, 8 /* shift esi back to left x1 */
sub eax, 4
movq mm0, [esi] /* mm0 = 30 20 10 00 */
movq mm1, [esi+16] /* mm1 = 31 21 11 01 */
movq mm4, mm0 /* mm4 = 30 20 10 00 */
punpcklwd mm0, mm1 /* mm0 = 11 10 01 00 */
punpckhwd mm4, mm1 /* mm4 = 31 30 21 20 */
movq mm2, [esi+32] /* mm2 = 32 22 12 02 */
movq mm3, [esi+48] /* mm3 = 33 23 13 03 */
movq mm5, mm2 /* mm5 = 32 22 12 02 */
punpcklwd mm2, mm3 /* mm2 = 13 12 03 02 */
punpckhwd mm5, mm3 /* mm5 = 33 32 23 22 */
movq mm1, mm0 /* mm1 = 11 10 01 00 */
punpckldq mm0, mm2 /* mm0 = 03 02 01 00 */
movq [edi], mm0 /* write 00 01 02 03 */
punpckhdq mm1, mm2 /* mm1 = 13 12 11 10 */
movq mm0, mm4 /* mm0 = 31 30 21 20 */
movq [edi+16], mm1 /* write 10 11 12 13 */
punpckldq mm0, mm5 /* mm0 = 23 22 21 20 */
punpckhdq mm4, mm5 /* mm4 = 33 32 31 30 */
movq mm1, [esi+64] /* mm1 = 34 24 14 04 */
movq mm2, [esi+80] /* mm2 = 35 25 15 05 */
movq mm5, [esi+96] /* mm5 = 36 26 16 06 */
movq mm6, [esi+112] /* mm6 = 37 27 17 07 */
movq mm3, mm1 /* mm3 = 34 24 14 04 */
movq mm7, mm5 /* mm7 = 36 26 16 06 */
punpcklwd mm1, mm2 /* mm1 = 15 14 05 04 */
punpckhwd mm3, mm2 /* mm3 = 35 34 25 24 */
punpcklwd mm5, mm6 /* mm5 = 17 16 07 06 */
punpckhwd mm7, mm6 /* mm7 = 37 36 27 26 */
movq mm2, mm1 /* mm2 = 15 14 05 04 */
movq mm6, mm3 /* mm6 = 35 34 25 24 */
punpckldq mm1, mm5 /* mm1 = 07 06 05 04 */
punpckhdq mm2, mm5 /* mm2 = 17 16 15 14 */
punpckldq mm3, mm7 /* mm3 = 27 26 25 24 */
punpckhdq mm6, mm7 /* mm6 = 37 36 35 34 */
movq mm5, [edi] /* mm5 = 03 02 01 00 */
packuswb mm5, mm1 /* mm5 = 07 06 05 04 03 02 01 00 */
movq [eax], mm5 /* write 00 01 02 03 04 05 06 07 */
movq mm7, [edi+16] /* mm7 = 13 12 11 10 */
packuswb mm7, mm2 /* mm7 = 17 16 15 14 13 12 11 10 */
movq [eax+ecx], mm7 /* write 10 11 12 13 14 15 16 17 */
packuswb mm0, mm3 /* mm0 = 27 26 25 24 23 22 21 20 */
packuswb mm4, mm6 /* mm4 = 37 36 35 34 33 32 31 30 */
movq [eax+ecx*2], mm0 /* write 20 21 22 23 24 25 26 27 */
lea eax, [eax+ecx*4] /* move the destination pointer forward */
movq [eax+edx], mm4 /* write 30 31 32 33 34 35 36 37 */
add edi, 8 /* move to the right four columns */
add esi, 8 /* move to the right x1 */
movq mm0, [esi] /* mm0 = 70 60 50 40 */
movq mm1, [esi+16] /* mm1 = 71 61 51 41 */
movq mm4, mm0 /* mm4 = 70 60 50 40 */
punpcklwd mm0, mm1 /* mm0 = 51 50 41 40 */
punpckhwd mm4, mm1 /* mm4 = 71 70 61 60 */
movq mm2, [esi+32] /* mm2 = 72 62 52 42 */
movq mm3, [esi+48] /* mm3 = 73 63 53 43 */
movq mm5, mm2 /* mm5 = 72 62 52 42 */
punpcklwd mm2, mm3 /* mm2 = 53 52 43 42 */
punpckhwd mm5, mm3 /* mm5 = 73 72 63 62 */
movq mm1, mm0 /* mm1 = 51 50 41 40 */
punpckldq mm0, mm2 /* mm0 = 43 42 41 40 */
movq [edi], mm0 /* write 40 41 42 43 */
punpckhdq mm1, mm2 /* mm1 = 53 52 51 50 */
movq mm0, mm4 /* mm0 = 71 70 61 60 */
movq [edi+16], mm1 /* write 50 51 52 53 */
punpckldq mm0, mm5 /* mm0 = 63 62 61 60 */
punpckhdq mm4, mm5 /* mm4 = 73 72 71 70 */
movq mm1, [esi+64] /* mm1 = 74 64 54 44 */
movq mm2, [esi+80] /* mm2 = 75 65 55 45 */
movq mm5, [esi+96] /* mm5 = 76 66 56 46 */
movq mm6, [esi+112] /* mm6 = 77 67 57 47 */
movq mm3, mm1 /* mm3 = 74 64 54 44 */
movq mm7, mm5 /* mm7 = 76 66 56 46 */
punpcklwd mm1, mm2 /* mm1 = 55 54 45 44 */
punpckhwd mm3, mm2 /* mm3 = 75 74 65 64 */
punpcklwd mm5, mm6 /* mm5 = 57 56 47 46 */
punpckhwd mm7, mm6 /* mm7 = 77 76 67 66 */
movq mm2, mm1 /* mm2 = 55 54 45 44 */
movq mm6, mm3 /* mm6 = 75 74 65 64 */
punpckldq mm1, mm5 /* mm1 = 47 46 45 44 */
punpckhdq mm2, mm5 /* mm2 = 57 56 55 54 */
punpckldq mm3, mm7 /* mm3 = 67 66 65 64 */
punpckhdq mm6, mm7 /* mm6 = 77 76 75 74 */
movq mm5, [edi] /* mm5 = 43 42 41 40 */
packuswb mm5, mm1 /* mm5 = 47 46 45 44 43 42 41 40 */
movq [eax], mm5 /* write 40 41 42 43 44 45 46 47 */
movq mm7, [edi+16] /* mm7 = 53 52 51 50 */
packuswb mm7, mm2 /* mm7 = 57 56 55 54 53 52 51 50 */
movq [eax+ecx], mm7 /* write 50 51 52 53 54 55 56 57 */
packuswb mm0, mm3 /* mm0 = 67 66 65 64 63 62 61 60 */
packuswb mm4, mm6 /* mm4 = 77 76 75 74 73 72 71 70 */
movq [eax+ecx*2], mm0 /* write 60 61 62 63 64 65 66 67 */
lea eax, [eax+ecx*4] /* move the destination pointer forward */
movq [eax+edx], mm4 /* write 70 71 72 73 74 75 76 77 */
pop edi
pop esi
pop edx
pop ecx
pop ebp
pop eax
}//__asm
Var1 = Variance11[0] + Variance11[1] + Variance11[2] + Variance11[3];
Var1 += Variance12[0] + Variance12[1] + Variance12[2] + Variance12[3];
pbi->FragmentVariances[CurrentFrag-1] += Var1;
Var2 = Variance21[0] + Variance21[1] + Variance21[2] + Variance21[3];
Var2 += Variance22[0] + Variance22[1] + Variance22[2] + Variance22[3];
pbi->FragmentVariances[CurrentFrag] += Var2;
CurrentFrag++;
}//else
}//while
}
/****************************************************************************
*
* ROUTINE : DeblockNonFilteredBandNewFilter_MMX
*
* INPUTS : None
*
* OUTPUTS : None
*
* RETURNS : None
*
* FUNCTION : Filter both horizontal and vertical edges in a band
*
* SPECIAL NOTES : Uses a sum of absolute differences to determine where
* to apply the new 7-tap filter
*
* REFERENCE :
*
* ERRORS : None.
*
****************************************************************************/
void DeblockNonFilteredBandNewFilter_MMX(
POSTPROC_INSTANCE *pbi,
UINT8 *SrcPtr,
UINT8 *DesPtr,
UINT32 PlaneLineStep,
UINT32 FragAcross,
UINT32 StartFrag,
UINT32 *QuantScale
)
{
UINT32 j;
UINT32 CurrentFrag = StartFrag;
UINT32 QStep;
UINT32 LoopFLimit;
UINT8 *Src, *Des;
#if defined(_WIN32_WCE)
#pragma pack(16)
short QStepMmx[4];
short FLimitMmx[4];
short LoopFLimitMmx[4];
short Rows[80];
short NewRows[64];
short LoopFilteredValuesUp[4];
short LoopFilteredValuesDown[4];
unsigned char Variance11[8];
unsigned char Variance21[8];
UINT32 Var1, Var2;
#pragma pack()
#else
__declspec(align(16)) short QStepMmx[4];
__declspec(align(16)) short FLimitMmx[4];
__declspec(align(16)) short LoopFLimitMmx[4];
__declspec(align(16)) short Rows[80];
__declspec(align(16)) short NewRows[64];
__declspec(align(16)) short LoopFilteredValuesUp[4];
__declspec(align(16)) short LoopFilteredValuesDown[4];
__declspec(align(16)) unsigned char Variance11[8];
__declspec(align(16)) unsigned char Variance21[8];
UINT32 Var1, Var2;
#endif
QStep = QuantScale[pbi->FrameQIndex];
QStepMmx[0] = (INT16)QStep;
QStepMmx[1] = (INT16)QStep;
QStepMmx[2] = (INT16)QStep;
QStepMmx[3] = (INT16)QStep;
LoopFLimit = DeblockLimitValuesV2[pbi->FrameQIndex];
LoopFLimitMmx[0] = (INT16)LoopFLimit;
LoopFLimitMmx[1] = (INT16)LoopFLimit;
LoopFLimitMmx[2] = (INT16)LoopFLimit;
LoopFLimitMmx[3] = (INT16)LoopFLimit;
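/* QStepMmx and LoopFLimitMmx are per-lane broadcasts: the same 16-bit
   threshold is replicated into all four lanes so the __asm code below
   can test four pixels per compare. */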
  3320. while(CurrentFrag < StartFrag + FragAcross )
  3321. {
  3322. Src=SrcPtr+8*(CurrentFrag-StartFrag);
  3323. Des=DesPtr+8*(CurrentFrag-StartFrag);
  3324. __asm
  3325. {
  3326. push eax
  3327. push ebp
  3328. push ecx
  3329. push edx
  3330. push esi
  3331. push edi
  3332. /* Calculate the FLimit and store FLimit and QStep */
  3333. /* Copy the data to the intermediate buffer */
  3334. mov eax, QStep
  3335. xor edx, edx /* clear edx */
  3336. mov ecx, PlaneLineStep /* ecx = Pitch */
  3337. movd mm5, eax
  3338. mov eax, Src /* eax = Src */
  3339. punpcklwd mm5, mm5
  3340. lea esi, NewRows /* esi = NewRows */
  3341. punpckldq mm5, mm5
  3342. sub edx, ecx /* edx = - Pitch */
  3343. movq mm6, mm5 /* Q Q Q Q */
  3344. paddw mm6, mm5
  3345. paddw mm6, mm5 /* 3Q3Q3Q3Q */
  3346. packuswb mm5, mm5 /* QQQQQQQQ */
  3347. movq QStepMmx, mm5
  3348. psraw mm6, 2 /* F F F F */
  3349. packuswb mm6, mm6 /* FFFFFFFF */
  3350. lea edi, Rows /* edi = Rows */
  3351. pxor mm7, mm7 /* Clear mm7 */
  3352. psubb mm6, Eight128c /* Eight (F-128)s */
  3353. lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
  3354. movq mm0, [eax + edx] /* mm0 = Src[-5*Pitch] */
  3355. movq mm1, mm0 /* mm1 = mm0 */
  3356. punpcklbw mm0, mm7 /* Lower Four -5 */
  3357. movq mm4, mm1 /* mm4 = Src[-5*Pitch] */
  3358. movq [FLimitMmx], mm6 /* FFFF FFFF */
  3359. movq mm2, [eax] /* mm2 = Src[-4*Pitch] */
  3360. punpckhbw mm1, mm7 /* Higher Four -5 */
  3361. movq [edi], mm0 /* Write Lower Four of -5 */
  3362. movq mm5, mm2 /* mm5 = S_4 */
  3363. movq mm3, mm2 /* mm3 = S_4 */
  3364. movq [edi+8], mm1 /* Write Higher Four of -5 */
  3365. movq mm0, [eax + ecx] /* mm0 = Src[-3*Pitch] */
  3366. psubusb mm5, mm4 /* S_4 - S_5 */
  3367. psubusb mm4, mm2 /* S_5 - S_4 */
  3368. punpcklbw mm2, mm7 /* Lower Four -4 */
  3369. por mm4, mm5 /* abs(S_4-S_5) */
  3370. movq [edi+16], mm2 /* Write Lower -4 */
  3371. movq mm6, mm3 /* mm6 = S_4 */
  3372. punpckhbw mm3, mm7 /* higher Four -4 */
  3373. movq [edi+24], mm3 /* write hight -4 */
  3374. movq mm1, mm0 /* mm1 = S_3 */
  3375. punpcklbw mm0, mm7 /* lower four -3 */
  3376. movq [edi+32], mm0 /* write Lower -3 */
  3377. movq mm2, [eax + ecx *2] /* mm2 = Src[-2*Pitch] */
  3378. movq mm5, mm1 /* mm5 = S_3 */
  3379. psubusb mm5, mm6 /* S_3 - S_4 */
  3380. psubusb mm6, mm1 /* S_4 - S_3 */
  3381. por mm5, mm6 /* abs(S_4-S_3) */
  3382. movq mm6, mm1 /* mm6 = S_3 */
  3383. punpckhbw mm1, mm7 /* higher four -3 */
  3384. movq mm3, mm2 /* mm3 = S_2 */
  3385. movq [edi+40], mm1 /* write Higher -3 */
  3386. paddusb mm4, mm5 /* abs(S_5-S_4)+abs(S_4-S_3) */
  3387. movq mm5, mm2 /* mm5 = S_2 */
  3388. psubusb mm5, mm6 /* S_2 - S_3 */
  3389. psubusb mm6, mm2 /* S_3 - S_2 */
  3390. por mm5, mm6 /* abs(S_3 - S_2) */
  3391. movq mm6, mm2 /* mm6 = S_2 */
  3392. punpcklbw mm2, mm7 /* lower four -2 */
  3393. lea eax, [eax + ecx *4] /* eax = Src */
  3394. punpckhbw mm3, mm7 /* higher four -2 */
3395. movq mm0, [eax + edx] /* mm0 = Src[-Pitch] */
  3396. movq [edi+48], mm2 /* lower -2 */
  3397. paddusb mm4, mm5 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2) */
  3398. movq mm5, mm0 /* mm5 = S_1 */
  3399. movq [edi+56], mm3 /* higher -2 */
  3400. movq mm1, mm0 /* mm1 = S_1 */
  3401. psubusb mm5, mm6 /* S_1 - S_2 */
  3402. psubusb mm6, mm1 /* S_2 - S_1 */
  3403. punpcklbw mm0, mm7 /* lower -1 */
  3404. por mm5, mm6 /* abs(S_2 - S_1) */
  3405. movq [edi+64], mm0 /* Lower -1 */
  3406. movq mm6, mm1 /* mm6 = S_1 */
  3407. punpckhbw mm1, mm7 /* Higher -1 */
  3408. movq [edi+72], mm1 /* Higher -1 */
  3409. movq mm0, [eax] /* mm0 = Src[0] */
  3410. paddusb mm4, mm5 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2 - S_1) */
  3411. movq [Variance11], mm4; /* save the variance */
  3412. movq mm5, FLimitMmx /* mm5 = FFFF FFFF */
  3413. psubb mm4, Eight128c /* abs(..) - 128 */
  3414. pcmpgtb mm5, mm4 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2 - S_1) < FLimit ? */
  3415. movq mm1, mm0 /* mm1 = S0 */
  3416. punpcklbw mm0, mm7 /* lower 0 */
  3417. movq mm4, mm1 /* mm4 = S0 */
  3418. movq [edi+80], mm0 /* write lower 0 */
  3419. psubusb mm4, mm6 /* S0 - S_1 */
  3420. psubusb mm6, mm1 /* S_1 - S0 */
  3421. movq mm0, [eax + ecx] /* mm0 = Src[Pitch] */
  3422. movq mm3, QStepMmx /* mm3 = QQQQQQQQQ */
  3423. por mm4, mm6 /* abs(S0 - S_1) */
  3424. movq mm6, mm1 /* mm6 = S0 */
  3425. psubb mm3, Eight128c /* -128 for using signed compare*/
  3426. psubb mm4, Eight128c /* -128 for using signed compare*/
  3427. pcmpgtb mm3, mm4 /* abs(S0-S_1) < QStep */
  3428. punpckhbw mm1, mm7 /* higher 0 */
  3429. movq mm4, mm0 /* mm4 = S1 */
  3430. pand mm5, mm3 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2 - S_1) < FLimit &&
  3431. abs(S0-S_1) < QStep */
  3432. movq [edi+88], mm1 /* write higher 0 */
  3433. movq mm1, mm0 /* mm1 = S1 */
  3434. psubusb mm4, mm6 /* S1 - S0 */
  3435. punpcklbw mm0, mm7 /* lower 1 */
  3436. psubusb mm6, mm1 /* S0 - S1 */
  3437. movq [edi+96], mm0 /* write lower 1 */
  3438. por mm4, mm6 /* mm4 = abs(S1-S0) */
  3439. movq mm2, [eax + ecx *2 ] /* mm2 = Src[2*Pitch] */
  3440. movq mm6, mm1 /* mm6 = S1 */
  3441. lea eax, [eax + ecx *4] /* eax = Src + 4 * Pitch */
  3442. punpckhbw mm1, mm7 /* higher 1 */
  3443. movq mm0, mm2 /* mm0 = S2 */
3444. movq [edi+104], mm1 /* write higher 1 */
  3445. movq mm3, mm0 /* mm3 = S2 */
3446. movq mm1, [eax + edx ] /* mm1 = Src[3*pitch] */
  3447. punpcklbw mm2, mm7 /* lower 2 */
  3448. psubusb mm3, mm6 /* S2 - S1 */
  3449. psubusb mm6, mm0 /* S1 - S2 */
  3450. por mm3, mm6 /* abs(S1-S2) */
  3451. movq [edi+112], mm2 /* write lower 2 */
  3452. movq mm6, mm0 /* mm6 = S2 */
  3453. punpckhbw mm0, mm7 /* higher 2 */
  3454. paddusb mm4, mm3 /* abs(S0-S1)+abs(S1-S2) */
  3455. movq mm2, mm1 /* mm2 = S3 */
  3456. movq mm3, mm1 /* mm3 = S3 */
  3457. movq [edi+120], mm0 /* write higher 2 */
  3458. punpcklbw mm1, mm7 /* Low 3 */
  3459. movq mm0, [eax] /* mm0 = Src[4*pitch] */
  3460. psubusb mm3, mm6 /* S3 - S2 */
  3461. psubusb mm6, mm2 /* S2 - S3 */
  3462. por mm3, mm6 /* abs(S2-S3) */
  3463. movq [edi+128], mm1 /* low 3 */
  3464. movq mm6, mm2 /* mm6 = S3 */
  3465. punpckhbw mm2, mm7 /* high 3 */
  3466. paddusb mm4, mm3 /* abs(S0-S1)+abs(S1-S2)+abs(S2-S3) */
  3467. movq mm1, mm0 /* mm1 = S4 */
  3468. movq mm3, mm0 /* mm3 = S4 */
  3469. movq [edi+136], mm2 /* high 3 */
  3470. punpcklbw mm0, mm7 /* low 4 */
  3471. psubusb mm3, mm6 /* S4 - S3 */
  3472. movq [edi+144], mm0 /* low 4 */
  3473. psubusb mm6, mm1 /* S3 - S4 */
  3474. por mm3, mm6 /* abs(S3-S4) */
  3475. punpckhbw mm1, mm7 /* high 4 */
  3476. paddusb mm4, mm3 /* abs((S0-S1)+abs(S1-S2)+abs(S2-S3)+abs(S3-S4) */
  3477. movq [Variance21], mm4; /* save the variance */
  3478. movq mm6, FLimitMmx /* mm6 = FFFFFFFFF */
  3479. psubb mm4, Eight128c /* abs(..) - 128 */
  3480. movq [edi+152], mm1 /* high 4 */
  3481. pcmpgtb mm6, mm4 /* abs((S0-S1)+abs(S1-S2)+abs(S2-S3)+abs(S3-S4)<FLimit? */
  3482. pand mm6, mm5 /* Flag */
  3483. /* done with copying everything to intermediate buffer */
  3484. /* mm7 = 0, mm6 = Flag */
  3485. movq mm0, mm6
  3486. movq mm7, mm6
  3487. punpckhbw mm0, mm6
  3488. punpcklbw mm7, mm6
  3489. /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
3490. /* mm0 and mm7 are now in use */
  3491. /* find the loop filtered values for the pixels on block boundary */
  3492. movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
  3493. movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
  3494. movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
  3495. movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
  3496. movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
  3497. psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
  3498. psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
  3499. movq mm4, mm5 /* make a copy */
  3500. paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
  3501. paddw mm3, FourFours /* mm3 + 4 */
  3502. paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
  3503. paddw mm3, mm5 /* Filtval before shift */
  3504. psraw mm3, 3 /* FiltVal */
  3505. movq mm2, mm3 /* make a copy */
  3506. psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
  3507. pxor mm2, mm3
  3508. psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
  3509. por mm3, FourOnes /* -1 and 1 for + and - */
  3510. movq mm4, mm1 /* make a copy of Flimit */
  3511. psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
  3512. movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
  3513. psraw mm1, 15 /* FFFF or 0000 */
  3514. pxor mm5, mm1
  3515. psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
  3516. psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
  3517. pmullw mm4, mm3 /* get the sign back */
  3518. movq mm1, [edi+64] /* p[-1] */
  3519. movq mm2, [edi+80] /* p[0] */
  3520. paddw mm1, mm4 /* p[-1] + NewFiltVal */
  3521. psubw mm2, mm4 /* p[0] - NewFiltVal */
  3522. pxor mm6, mm6 /* clear mm6 */
  3523. packuswb mm1, mm1 /* clamping */
  3524. packuswb mm2, mm2 /* clamping */
  3525. punpcklbw mm1, mm6 /* unpack to word */
  3526. movq LoopFilteredValuesUp, mm1 /* save the values */
  3527. punpcklbw mm2, mm6 /* unpack to word */
  3528. movq LoopFilteredValuesDown, mm2 /* save the values */
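/*
 * The block above computes the 1D loop-filter correction for the pixel
 * pair straddling the edge: FiltVal = (p[-2] - p[1] + 3*(p[0] - p[-1])
 * + 4) >> 3, applied at full strength up to LoopFLimit and tapered to
 * zero beyond it, then added to p[-1] and subtracted from p[0].  The two
 * clamped results are saved for the x4/x5 steps below, where they act as
 * the base values that the full smoothing replaces when the filter flag
 * is set.
 */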
  3529. /* Let's do the filtering now */
  3530. /* p1 = Src[-5] */
  3531. /* p2 = Src[+4] */
  3532. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  3533. movq mm3, [edi] /* mm3 = [-5] */
  3534. movq mm2, [edi+144] /* mm2 = [4] */
3535. movq mm1, mm3 /* p1 = [-5] */
  3536. paddw mm3, mm3 /* mm3 = p1 + p1 */
  3537. movq mm4, [edi+16] /* mm4 = x1 */
  3538. paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
  3539. paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
  3540. paddw mm4, [edi+48] /* mm4 = x1+x3 */
  3541. paddw mm3, [edi+64] /* mm3 += x4 */
  3542. paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
  3543. paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
3544. /* Des[-w4] = (sum + x1) >> 3; or */
3545. /* Des[-w4] = Src[-w4]; */
3546. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
  3547. movq mm4, mm3 /* mm4 = mm3 */
  3548. movq mm5, [edi+16] /* mm5 = x1 */
  3549. paddw mm4, mm5 /* mm4 = sum+x1 */
3550. psraw mm4, 3 /* mm4 >>= 3 */
  3551. psubw mm4, mm5 /* New Value - old Value */
  3552. pand mm4, mm7 /* And the flag */
  3553. paddw mm4, mm5 /* add the old value back */
  3554. movq [esi], mm4 /* Write new x1 */
  3555. /* sum += x5 -p1 */
3556. /* Des[-w3]=(sum+x2)>>3 */
  3557. movq mm5, [edi+32] /* mm5= x2 */
  3558. psubw mm3, mm1 /* sum=sum-p1 */
  3559. paddw mm3, [edi+80] /* sum=sum+x5 */
  3560. movq mm4, mm5 /* copy sum */
  3561. paddw mm4, mm3 /* mm4=sum+x2 */
3562. psraw mm4, 3 /* mm4=(sum+x2)>>3 */
  3563. psubw mm4, mm5 /* new value - old value */
  3564. pand mm4, mm7 /* And the flag */
  3565. paddw mm4, mm5 /* add the old value back */
  3566. movq [esi+16], mm4 /* write new x2 */
  3567. /* sum += x6 - p1 */
3568. /* Des[-w2]=(sum+x3)>>3 */
  3569. movq mm5, [edi+48] /* mm5= x3 */
  3570. psubw mm3, mm1 /* sum=sum-p1 */
  3571. paddw mm3, [edi+96] /* sum=sum+x6 */
  3572. movq mm4, mm5 /* copy x3 */
  3573. paddw mm4, mm3 /* mm4=sum+x3 */
3574. psraw mm4, 3 /* mm4=(sum+x3)>>3 */
  3575. psubw mm4, mm5 /* new value - old value */
  3576. pand mm4, mm7 /* And the flag */
  3577. paddw mm4, mm5 /* add the old value back */
  3578. movq [esi+32], mm4 /* write new x3 */
  3579. /* sum += x7 - p1 */
3580. /* Des[-w1]=(sum+x4)>>3 */
  3581. movq mm5, [edi+64] /* mm5 = x4 */
  3582. psubw mm3, mm1 /* sum = sum-p1 */
  3583. paddw mm3, [edi+112] /* sum = sum+x7 */
  3584. movq mm4, mm5 /* mm4 = x4 */
  3585. paddw mm4, mm3 /* mm4 = sum + x4 */
  3586. movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
3587. psraw mm4, 3 /* >>=3 */
  3588. psubw mm4, mm5 /* -=x4 */
  3589. pand mm4, mm7 /* and flag */
  3590. paddw mm4, mm5 /* += x4 */
  3591. movq [esi+48], mm4 /* write new x4 */
  3592. /* sum+= x8-x1 */
3593. /* Des[0]=(sum+x5)>>3 */
  3594. movq mm5, [edi+80] /* mm5 = x5 */
  3595. psubw mm3, [edi+16] /* sum -= x1 */
3596. paddw mm3, [edi+128] /* sum += x8 */
  3597. movq mm4, mm5 /* mm4 = x5 */
  3598. paddw mm4, mm3 /* mm4= sum+x5 */
3599. movq mm5, LoopFilteredValuesDown /* Read the loopfiltered value of x5 */
3600. psraw mm4, 3 /* >>=3 */
  3601. psubw mm4, mm5 /* -=x5 */
  3602. pand mm4, mm7 /* and flag */
  3603. paddw mm4, mm5 /* += x5 */
  3604. movq [esi+64], mm4 /* write new x5 */
  3605. /* sum += p2 - x2 */
3606. /* Des[w1] = (sum+x6)>>3 */
  3607. movq mm5, [edi+96] /* mm5 = x6 */
  3608. psubw mm3, [edi+32] /* -= x2 */
  3609. paddw mm3, mm2 /* += p2 */
  3610. movq mm4, mm5 /* mm4 = x6 */
  3611. paddw mm4, mm3 /* mm4 = sum+x6 */
  3612. psraw mm4, 3 /* >>=3 */
  3613. psubw mm4, mm5 /* -=x6 */
  3614. pand mm4, mm7 /* and flag */
  3615. paddw mm4, mm5 /* += x6 */
  3616. movq [esi+80], mm4 /* write new x6 */
  3617. /* sum += p2 - x3 */
  3618. /* Des[w2] = (sum+x7)>>3 */
  3619. movq mm5, [edi+112] /* mm5 = x7 */
  3620. psubw mm3, [edi+48] /* -= x3 */
  3621. paddw mm3, mm2 /* += p2 */
  3622. movq mm4, mm5 /* mm4 = x7 */
  3623. paddw mm4, mm3 /* mm4 = sum+x7 */
  3624. psraw mm4, 3 /* >>=3 */
  3625. psubw mm4, mm5 /* -=x7 */
  3626. pand mm4, mm7 /* and flag */
  3627. paddw mm4, mm5 /* += x7 */
  3628. movq [esi+96], mm4 /* write new x7 */
  3629. /* sum += p2 - x4 */
3630. /* Des[w3] = (sum+x8)>>3 */
  3631. movq mm5, [edi+128] /* mm5 = x8 */
  3632. psubw mm3, [edi+64] /* -= x4 */
  3633. paddw mm3, mm2 /* += p2 */
  3634. movq mm4, mm5 /* mm4 = x8 */
  3635. paddw mm4, mm3 /* mm4 = sum+x8 */
  3636. psraw mm4, 3 /* >>=3 */
  3637. psubw mm4, mm5 /* -=x8 */
  3638. pand mm4, mm7 /* and flag */
  3639. paddw mm4, mm5 /* += x8 */
  3640. movq [esi+112], mm4 /* write new x8 */
  3641. /* done with left four columns */
3642. /* now do the right four columns */
3643. add edi, 8 /* shift to the right four columns */
3644. add esi, 8 /* shift to the right four columns */
  3645. /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
3646. /* only mm0 is now in use */
  3647. /* find the loop filtered values for the pixels on block boundary */
  3648. movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
  3649. movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
  3650. movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
  3651. movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
  3652. movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
  3653. psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
  3654. psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
  3655. movq mm4, mm5 /* make a copy */
  3656. paddw mm3, FourFours /* mm3 + 4 */
  3657. paddw mm4, mm4 /* 2 * ( p[0] - p[-1] ) */
  3658. paddw mm3, mm4 /* 3 * ( p[0] - p[-1] ) */
  3659. paddw mm3, mm5 /* Filtval before shift */
  3660. psraw mm3, 3 /* FiltVal */
  3661. movq mm2, mm3 /* make a copy */
  3662. psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
  3663. pxor mm2, mm3
  3664. psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
  3665. por mm3, FourOnes /* -1 and 1 for + and - */
  3666. movq mm4, mm1 /* make a copy of Flimit */
  3667. psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
  3668. movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
  3669. psraw mm1, 15 /* FFFF or 0000 */
  3670. pxor mm5, mm1
  3671. psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
  3672. psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
  3673. pmullw mm4, mm3 /* get the sign back */
  3674. movq mm1, [edi+64] /* p[-1] */
  3675. movq mm2, [edi+80] /* p[0] */
  3676. paddw mm1, mm4 /* p[-1] + NewFiltVal */
  3677. psubw mm2, mm4 /* p[0] - NewFiltVal */
  3678. pxor mm6, mm6 /* clear mm6 */
  3679. packuswb mm1, mm1 /* clamping */
  3680. packuswb mm2, mm2 /* clamping */
  3681. punpcklbw mm1, mm6 /* unpack to word */
  3682. movq LoopFilteredValuesUp, mm1 /* save the values */
  3683. punpcklbw mm2, mm6 /* unpack to word */
  3684. movq LoopFilteredValuesDown, mm2 /* save the values */
  3685. /* Let's do the filtering now */
  3686. /* p1 = Src[-5] */
  3687. /* p2 = Src[+4] */
  3688. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  3689. movq mm3, [edi] /* mm3 = [-5] */
  3690. movq mm2, [edi+144] /* mm2 = [4] */
3691. movq mm1, mm3 /* p1 = [-5] */
  3692. paddw mm3, mm3 /* mm3 = p1 + p1 */
  3693. movq mm4, [edi+16] /* mm4 = x1 */
  3694. paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
  3695. paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
  3696. paddw mm4, [edi+48] /* mm4 = x1+x3 */
  3697. paddw mm3, [edi+64] /* mm3 += x4 */
  3698. paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
  3699. paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
3700. /* Des[-w4] = (sum + x1) >> 3; or */
3701. /* Des[-w4] = Src[-w4]; */
3702. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
  3703. movq mm4, mm3 /* mm4 = mm3 */
  3704. movq mm5, [edi+16] /* mm5 = x1 */
  3705. paddw mm4, mm5 /* mm4 = sum+x1 */
3706. psraw mm4, 3 /* mm4 >>= 3 */
  3707. psubw mm4, mm5 /* New Value - old Value */
  3708. pand mm4, mm0 /* And the flag */
  3709. paddw mm4, mm5 /* add the old value back */
  3710. movq [esi], mm4 /* Write new x1 */
  3711. /* sum += x5 -p1 */
3712. /* Des[-w3]=(sum+x2)>>3 */
  3713. movq mm5, [edi+32] /* mm5= x2 */
  3714. psubw mm3, mm1 /* sum=sum-p1 */
  3715. paddw mm3, [edi+80] /* sum=sum+x5 */
  3716. movq mm4, mm5 /* copy sum */
  3717. paddw mm4, mm3 /* mm4=sum+x2 */
3718. psraw mm4, 3 /* mm4=(sum+x2)>>3 */
  3719. psubw mm4, mm5 /* new value - old value */
  3720. pand mm4, mm0 /* And the flag */
  3721. paddw mm4, mm5 /* add the old value back */
  3722. movq [esi+16], mm4 /* write new x2 */
  3723. /* sum += x6 - p1 */
3724. /* Des[-w2]=(sum+x3)>>3 */
  3725. movq mm5, [edi+48] /* mm5= x3 */
  3726. psubw mm3, mm1 /* sum=sum-p1 */
  3727. paddw mm3, [edi+96] /* sum=sum+x6 */
  3728. movq mm4, mm5 /* copy x3 */
  3729. paddw mm4, mm3 /* mm4=sum+x3 */
3730. psraw mm4, 3 /* mm4=(sum+x3)>>3 */
  3731. psubw mm4, mm5 /* new value - old value */
  3732. pand mm4, mm0 /* And the flag */
  3733. paddw mm4, mm5 /* add the old value back */
  3734. movq [esi+32], mm4 /* write new x3 */
  3735. /* sum += x7 - p1 */
3736. /* Des[-w1]=(sum+x4)>>3 */
  3737. movq mm5, [edi+64] /* mm5 = x4 */
  3738. psubw mm3, mm1 /* sum = sum-p1 */
  3739. paddw mm3, [edi+112] /* sum = sum+x7 */
  3740. movq mm4, mm5 /* mm4 = x4 */
  3741. paddw mm4, mm3 /* mm4 = sum + x4 */
  3742. movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
3743. psraw mm4, 3 /* >>=3 */
  3744. psubw mm4, mm5 /* -=x4 */
  3745. pand mm4, mm0 /* and flag */
  3746. paddw mm4, mm5 /* += x4 */
  3747. movq [esi+48], mm4 /* write new x4 */
  3748. /* sum+= x8-x1 */
3749. /* Des[0]=(sum+x5)>>3 */
  3750. movq mm5, [edi+80] /* mm5 = x5 */
  3751. psubw mm3, [edi+16] /* sum -= x1 */
3752. paddw mm3, [edi+128] /* sum += x8 */
  3753. movq mm4, mm5 /* mm4 = x5 */
  3754. paddw mm4, mm3 /* mm4= sum+x5 */
3755. movq mm5, LoopFilteredValuesDown /* Read the loopfiltered value of x5 */
3756. psraw mm4, 3 /* >>=3 */
  3757. psubw mm4, mm5 /* -=x5 */
  3758. pand mm4, mm0 /* and flag */
  3759. paddw mm4, mm5 /* += x5 */
  3760. movq [esi+64], mm4 /* write new x5 */
  3761. /* sum += p2 - x2 */
3762. /* Des[w1] = (sum+x6)>>3 */
  3763. movq mm5, [edi+96] /* mm5 = x6 */
  3764. psubw mm3, [edi+32] /* -= x2 */
  3765. paddw mm3, mm2 /* += p2 */
  3766. movq mm4, mm5 /* mm4 = x6 */
  3767. paddw mm4, mm3 /* mm4 = sum+x6 */
  3768. psraw mm4, 3 /* >>=3 */
  3769. psubw mm4, mm5 /* -=x6 */
  3770. pand mm4, mm0 /* and flag */
  3771. paddw mm4, mm5 /* += x6 */
  3772. movq [esi+80], mm4 /* write new x6 */
  3773. /* sum += p2 - x3 */
  3774. /* Des[w2] = (sum+x7)>>3 */
  3775. movq mm5, [edi+112] /* mm5 = x7 */
  3776. psubw mm3, [edi+48] /* -= x3 */
  3777. paddw mm3, mm2 /* += p2 */
  3778. movq mm4, mm5 /* mm4 = x7 */
  3779. paddw mm4, mm3 /* mm4 = sum+x7 */
  3780. psraw mm4, 3 /* >>=3 */
  3781. psubw mm4, mm5 /* -=x7 */
  3782. pand mm4, mm0 /* and flag */
  3783. paddw mm4, mm5 /* += x7 */
  3784. movq [esi+96], mm4 /* write new x7 */
  3785. /* sum += p2 - x4 */
3786. /* Des[w3] = (sum+x8)>>3 */
  3787. movq mm5, [edi+128] /* mm5 = x8 */
  3788. psubw mm3, [edi+64] /* -= x4 */
  3789. paddw mm3, mm2 /* += p2 */
  3790. movq mm4, mm5 /* mm4 = x8 */
  3791. paddw mm4, mm3 /* mm4 = sum+x8 */
  3792. psraw mm4, 3 /* >>=3 */
  3793. psubw mm4, mm5 /* -=x8 */
  3794. pand mm4, mm0 /* and flag */
  3795. paddw mm4, mm5 /* += x8 */
  3796. movq [esi+112], mm4 /* write new x8 */
3797. /* done with the right four columns */
  3798. add edi, 8 /* shift edi to point x1 */
  3799. sub esi, 8 /* shift esi back to x1 */
  3800. mov ebp, Des /* the destination */
  3801. lea ebp, [ebp + edx *4] /* point to des[-w4] */
  3802. movq mm0, [esi]
  3803. packuswb mm0, [esi + 8]
  3804. movq [ebp], mm0 /* write des[-w4] */
  3805. movq mm1, [esi + 16]
  3806. packuswb mm1, [esi + 24]
  3807. movq [ebp+ecx ], mm1 /* write des[-w3] */
  3808. movq mm2, [esi + 32]
  3809. packuswb mm2, [esi + 40]
  3810. movq [ebp+ecx*2 ], mm2 /* write des[-w2] */
  3811. movq mm3, [esi + 48]
  3812. packuswb mm3, [esi + 56]
  3813. lea ebp, [ebp+ecx*4] /* point to des[0] */
  3814. movq [ebp+edx], mm3 /* write des[-w1] */
  3815. movq mm0, [esi + 64]
  3816. packuswb mm0, [esi + 72]
  3817. movq [ebp ], mm0 /* write des[0] */
  3818. movq mm1, [esi + 80]
  3819. packuswb mm1, [esi + 88]
  3820. movq [ebp+ecx], mm1 /* write des[w1] */
  3821. movq mm2, [esi + 96]
  3822. packuswb mm2, [esi + 104]
  3823. movq [ebp+ecx*2], mm2 /* write des[w2] */
  3824. movq mm3, [esi + 112]
  3825. packuswb mm3, [esi + 120]
3826. lea ebp, [ebp+ecx*2] /* point to des[w2] */
  3827. movq [ebp+ecx], mm3 /* write des[w3] */
  3828. pop edi
  3829. pop esi
  3830. pop edx
  3831. pop ecx
  3832. pop ebp
  3833. pop eax
3834. } /* end of the inline assembly */
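/*
 * Variance11/Variance21 hold per-column sums of absolute differences from
 * the asm above; fold them into the variance accumulators of the two
 * fragments sharing this edge.
 */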
  3835. Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
  3836. Var1 += Variance11[4]+ Variance11[5]+Variance11[6]+Variance11[7];
  3837. pbi->FragmentVariances[CurrentFrag] += Var1;
  3838. Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
  3839. Var2 += Variance21[4]+ Variance21[5]+Variance21[6]+Variance21[7];
  3840. pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
  3841. if(CurrentFrag==StartFrag)
  3842. CurrentFrag++;
  3843. else
  3844. {
  3845. Des=DesPtr-8*PlaneLineStep+8*(CurrentFrag-StartFrag);
  3846. Src=Des;
  3847. for( j=0; j<8;j++)
  3848. {
  3849. Rows[j] = (short) (Src[-5+j*PlaneLineStep]);
  3850. Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
  3851. }
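/*
 * Vertical-edge case: the asm below transposes the 8x8 pixel block that
 * spans the edge into Rows[] so the same row-oriented filter can be
 * reused, then transposes the result back on the way out.  Only the two
 * outlying columns, Src[-5] and Src[4], are fetched here in C, one pixel
 * per row.
 */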
  3852. __asm
  3853. {
  3854. /* Save the registers */
  3855. push eax
  3856. push ebp
  3857. /* Calculate the FLimit and store FLimit and QStep */
  3858. mov eax, QStep /* get QStep */
  3859. movd mm0, eax /* mm0 = 0, 0, 0, Q */
  3860. push ecx
  3861. punpcklwd mm0, mm0 /* mm0 = 0, 0, Q, Q */
  3862. punpckldq mm0, mm0 /* mm0 = Q, Q, Q, Q */
  3863. push edx
  3864. movq mm1, mm0 /* mm1 = Q, Q, Q, Q */
  3865. paddw mm1, mm0
  3866. push esi
  3867. paddw mm1, mm0
  3868. packuswb mm0, mm0
  3869. push edi
  3870. movq QStepMmx, mm0 /* write the Q step */
  3871. psraw mm1, 2 /* mm1 = FLimit */
  3872. packuswb mm1, mm1 /* mm1 = FFFF FFFF */
  3873. psubb mm1, Eight128c /* F-128 */
  3874. movq [FLimitMmx], mm1 /* Save FLimit */
  3875. /* setup the pointers to data */
  3876. mov eax, Src /* eax = Src */
  3877. xor edx, edx /* clear edx */
  3878. sub eax, 4 /* eax = Src-4 */
  3879. lea esi, NewRows /* esi = NewRows */
  3880. lea edi, Rows /* edi = Rows */
  3881. mov ecx, PlaneLineStep /* ecx = Pitch */
  3882. sub edx, ecx /* edx = -Pitch */
  3883. /* Get the data to the intermediate buffer */
  3884. movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
  3885. movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
  3886. movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
  3887. lea eax, [eax+ecx*4] /* Go down four Rows */
  3888. movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
  3889. movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
  3890. punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
  3891. punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
  3892. movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
  3893. punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
  3894. punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
  3895. movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
  3896. punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
  3897. punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
  3898. movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
  3899. punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
  3900. punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
  3901. pxor mm7, mm7 /* clear mm7 */
  3902. movq mm5, mm0 /* make a copy */
  3903. punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
  3904. movq [edi+16], mm0 /* write 00 10 20 30 */
  3905. punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
  3906. movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
  3907. movq [edi+32], mm5 /* write 01 11 21 31 */
  3908. punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
3909. punpckhbw mm0, mm7 /* mm0 = 33 23 13 03 */
  3910. movq [edi+48], mm1 /* write 02 12 22 32 */
  3911. movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
  3912. movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
  3913. movq [edi+64], mm0 /* write 03 13 23 33 */
  3914. punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
  3915. punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
  3916. movq [edi+80], mm2 /* write 04 14 24 34 */
  3917. punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
  3918. punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
  3919. movq [edi+96], mm3 /* write 05 15 25 35 */
  3920. movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
  3921. movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
3922. movq [edi+112], mm4 /* write 06 16 26 36 */
  3923. movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
  3924. lea eax, [eax+ ecx*4] /* Go down four rows */
  3925. movq [edi+128], mm5 /* write 07 17 27 37 */
  3926. movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
  3927. movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
  3928. punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
3929. punpckhbw mm4, mm1 /* mm4 = 57 47 56 46 55 45 54 44 */
  3930. movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
  3931. punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
  3932. punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
  3933. movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
  3934. punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
  3935. punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
3936. movq mm2, mm4 /* mm2 = 57 47 56 46 55 45 54 44 */
  3937. punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
  3938. punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
  3939. movq mm5, mm0 /* make a copy */
  3940. punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
  3941. movq [edi+24], mm0 /* write 40 50 60 70 */
  3942. punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
  3943. movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
  3944. movq [edi+40], mm5 /* write 41 51 61 71 */
  3945. punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
  3946. punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
  3947. movq [edi+56], mm1 /* write 42 52 62 72 */
  3948. movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
  3949. movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
  3950. movq [edi+72], mm0 /* write 43 53 63 73 */
  3951. punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
  3952. punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
  3953. movq [edi+88], mm2 /* write 44 54 64 74 */
  3954. punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
  3955. punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
  3956. movq [edi+104], mm3 /* write 45 55 65 75 */
  3957. movq [edi+120], mm4 /* write 46 56 66 76 */
  3958. movq [edi+136], mm5 /* write 47 57 67 77 */
3959. /* Now, compute the variances for pixels 1-4 and 5-8 */
  3960. movq mm0, [edi] /* S_5 */
  3961. movq mm1, [edi+16] /* S_4 */
  3962. movq mm2, [edi+32] /* S_3 */
  3963. packuswb mm0, [edi+8]
  3964. packuswb mm1, [edi+24]
  3965. packuswb mm2, [edi+40]
  3966. movq mm3, [edi+48] /* S_2 */
  3967. movq mm4, [edi+64] /* S_1 */
  3968. packuswb mm3, [edi+56]
  3969. packuswb mm4, [edi+72]
  3970. movq mm5, mm1 /* S_4 */
  3971. movq mm6, mm2 /* S_3 */
  3972. psubusb mm5, mm0 /* S_4 - S_5 */
  3973. psubusb mm0, mm1 /* S_5 - S_4 */
  3974. por mm0, mm5 /* abs(S_5-S_4) */
  3975. psubusb mm6, mm1 /* S_3 - S_4 */
  3976. psubusb mm1, mm2 /* S_4 - S_3 */
  3977. movq mm5, mm3 /* S_2 */
  3978. por mm1, mm6 /* abs(S_4-S_3) */
  3979. psubusb mm5, mm2 /* S_2 - S_3 */
  3980. psubusb mm2, mm3 /* S_3 - S_2 */
  3981. movq mm6, mm4 /* S_1 */
  3982. por mm2, mm5 /* abs(S_3-S_2) */
  3983. psubusb mm6, mm3 /* S_1 - S_2 */
  3984. psubusb mm3, mm4 /* S_2 - S_1 */
  3985. por mm3, mm6 /* abs(S_2-S_1) */
  3986. paddusb mm0, mm1 /* abs(S_5-S_4)+abs(S_4-S_3) */
  3987. paddusb mm2, mm3 /* abs(S_3-S_2)+abs(S_2-S_1) */
  3988. movq mm7, FLimitMmx /* FFFFF FFFF */
  3989. paddusb mm0, mm2 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2-S_1) */
  3990. movq [Variance11], mm0 /* Save the variance */
  3991. movq mm6, mm4 /* S_1 */
  3992. psubb mm0, Eight128c /* abs(..) - 128 */
3993. pcmpgtb mm7, mm0 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2-S_1) < FLimit? */
  3994. movq mm5, [edi+80] /* S0 */
  3995. movq mm1, [edi+96] /* S1 */
  3996. movq mm2, [edi+112] /* S2 */
  3997. packuswb mm5, [edi+88]
  3998. packuswb mm1, [edi+104]
  3999. packuswb mm2, [edi+120]
  4000. movq mm3, [edi+128] /* S3 */
  4001. movq mm4, [edi+144] /* S4 */
  4002. packuswb mm3, [edi+136]
  4003. packuswb mm4, [edi+152]
  4004. movq mm0, mm5 /* S0 */
  4005. psubusb mm5, mm6 /* S0-S_1 */
  4006. psubusb mm6, mm0 /* S_1-S0 */
  4007. por mm5, mm6 /* abs(S_1-S0) */
  4008. movq mm6, QStepMmx /* QQQQ QQQQ */
  4009. psubb mm5, Eight128c /* -128 for using signed compare*/
  4010. psubb mm6, Eight128c /* -128 for using signed compare*/
  4011. pcmpgtb mm6, mm5 /* abs(S_1-S0)<QStep? */
  4012. movq mm5, mm1 /* S1 */
  4013. pand mm7, mm6 /* abs(S_1-S0)<QStep &&
  4014. abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2-S_1)<FLimit? */
  4015. movq mm6, mm2 /* S2 */
  4016. psubusb mm5, mm0 /* S1 - S0 */
  4017. psubusb mm0, mm1 /* S0 - S1*/
  4018. por mm0, mm5 /* abs(S0-S1) */
  4019. psubusb mm6, mm1 /* S2 - S1 */
  4020. psubusb mm1, mm2 /* S1 - S2*/
  4021. movq mm5, mm3 /* S3 */
  4022. por mm1, mm6 /* abs(S1-S2) */
  4023. psubusb mm5, mm2 /* S3 - S2 */
  4024. psubusb mm2, mm3 /* S2 - S3 */
  4025. movq mm6, mm4 /* S4 */
  4026. por mm2, mm5 /* abs(S2-S3) */
  4027. psubusb mm6, mm3 /* S4 - S3 */
  4028. psubusb mm3, mm4 /* S3 - S4 */
  4029. por mm3, mm6 /* abs(S3-S4) */
  4030. paddusb mm0, mm1 /* abs(S0-S1)+abs(S1-S2) */
  4031. paddusb mm2, mm3 /* abs(S2-S3)+abs(S3-S4) */
  4032. movq mm6, FLimitMmx /* FFFFF FFFF */
  4033. paddusb mm0, mm2 /* abs(S0-S1)+abs(S1-S2)+abs(S2-S3)+abs(S3-S4) */
  4034. movq [Variance21], mm0 /* Save the variance */
  4035. psubb mm0, Eight128c /* abs(..) - 128 */
  4036. pcmpgtb mm6, mm0 /* abs(S0-S1)+abs(S1-S2)+abs(S2-S3)+abs(S3-S4)<FLimit */
  4037. pand mm6, mm7 /* Flag */
  4038. movq mm0, mm6
  4039. movq mm7, mm6
  4040. punpckhbw mm0, mm6
  4041. punpcklbw mm7, mm6
  4042. /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
4043. /* mm0 and mm7 are now in use */
  4044. /* find the loop filtered values for the pixels on block boundary */
  4045. movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
  4046. movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
  4047. movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
  4048. movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
  4049. movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
  4050. psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
  4051. psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
  4052. movq mm4, mm5 /* make a copy */
  4053. paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
  4054. paddw mm3, FourFours /* mm3 + 4 */
  4055. paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
  4056. paddw mm3, mm5 /* Filtval before shift */
  4057. psraw mm3, 3 /* FiltVal */
  4058. movq mm2, mm3 /* make a copy */
  4059. psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
  4060. pxor mm2, mm3
  4061. psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
  4062. por mm3, FourOnes /* -1 and 1 for + and - */
  4063. movq mm4, mm1 /* make a copy of Flimit */
  4064. psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
  4065. movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
  4066. psraw mm1, 15 /* FFFF or 0000 */
  4067. pxor mm5, mm1
  4068. psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
  4069. psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
  4070. pmullw mm4, mm3 /* get the sign back */
  4071. movq mm1, [edi+64] /* p[-1] */
  4072. movq mm2, [edi+80] /* p[0] */
  4073. paddw mm1, mm4 /* p[-1] + NewFiltVal */
  4074. psubw mm2, mm4 /* p[0] - NewFiltVal */
  4075. pxor mm6, mm6 /* clear mm6 */
  4076. packuswb mm1, mm1 /* clamping */
  4077. packuswb mm2, mm2 /* clamping */
  4078. punpcklbw mm1, mm6 /* unpack to word */
  4079. movq LoopFilteredValuesUp, mm1 /* save the values */
  4080. punpcklbw mm2, mm6 /* unpack to word */
  4081. movq LoopFilteredValuesDown, mm2 /* save the values */
  4082. /* Let's do the filtering now */
  4083. /* p1 = Src[-5] */
  4084. /* p2 = Src[+4] */
  4085. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  4086. movq mm3, [edi] /* mm3 = [-5] */
  4087. movq mm2, [edi+144] /* mm2 = [4] */
4088. movq mm1, mm3 /* p1 = [-5] */
  4089. paddw mm3, mm3 /* mm3 = p1 + p1 */
  4090. movq mm4, [edi+16] /* mm4 = x1 */
  4091. paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
  4092. paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
  4093. paddw mm4, [edi+48] /* mm4 = x1+x3 */
  4094. paddw mm3, [edi+64] /* mm3 += x4 */
  4095. paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
  4096. paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
4097. /* Des[-w4] = (sum + x1) >> 3; or */
4098. /* Des[-w4] = Src[-w4]; */
4099. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
  4100. movq mm4, mm3 /* mm4 = mm3 */
  4101. movq mm5, [edi+16] /* mm5 = x1 */
  4102. paddw mm4, mm5 /* mm4 = sum+x1 */
  4103. psraw mm4, 3 /* mm4 >>=3 */
  4104. psubw mm4, mm5 /* New Value - old Value */
  4105. pand mm4, mm7 /* And the flag */
  4106. paddw mm4, mm5 /* add the old value back */
  4107. movq [esi], mm4 /* Write new x1 */
  4108. /* sum += x5 -p1 */
4109. /* Des[-w3]=(sum+x2)>>3 */
  4110. movq mm5, [edi+32] /* mm5= x2 */
  4111. psubw mm3, mm1 /* sum=sum-p1 */
  4112. paddw mm3, [edi+80] /* sum=sum+x5 */
  4113. movq mm4, mm5 /* copy sum */
  4114. paddw mm4, mm3 /* mm4=sum+x2 */
4115. psraw mm4, 3 /* mm4=(sum+x2)>>3 */
  4116. psubw mm4, mm5 /* new value - old value */
  4117. pand mm4, mm7 /* And the flag */
  4118. paddw mm4, mm5 /* add the old value back */
  4119. movq [esi+16], mm4 /* write new x2 */
  4120. /* sum += x6 - p1 */
4121. /* Des[-w2]=(sum+x3)>>3 */
  4122. movq mm5, [edi+48] /* mm5= x3 */
  4123. psubw mm3, mm1 /* sum=sum-p1 */
  4124. paddw mm3, [edi+96] /* sum=sum+x6 */
  4125. movq mm4, mm5 /* copy x3 */
  4126. paddw mm4, mm3 /* mm4=sum+x3 */
4127. psraw mm4, 3 /* mm4=(sum+x3)>>3 */
  4128. psubw mm4, mm5 /* new value - old value */
  4129. pand mm4, mm7 /* And the flag */
  4130. paddw mm4, mm5 /* add the old value back */
  4131. movq [esi+32], mm4 /* write new x3 */
  4132. /* sum += x7 - p1 */
4133. /* Des[-w1]=(sum+x4)>>3 */
  4134. movq mm5, [edi+64] /* mm5 = x4 */
  4135. psubw mm3, mm1 /* sum = sum-p1 */
  4136. paddw mm3, [edi+112] /* sum = sum+x7 */
  4137. movq mm4, mm5 /* mm4 = x4 */
  4138. paddw mm4, mm3 /* mm4 = sum + x4 */
  4139. movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
4140. psraw mm4, 3 /* >>=3 */
  4141. psubw mm4, mm5 /* -=x4 */
  4142. pand mm4, mm7 /* and flag */
  4143. paddw mm4, mm5 /* += x4 */
  4144. movq [esi+48], mm4 /* write new x4 */
  4145. /* sum+= x8-x1 */
4146. /* Des[0]=(sum+x5)>>3 */
  4147. movq mm5, [edi+80] /* mm5 = x5 */
  4148. psubw mm3, [edi+16] /* sum -= x1 */
4149. paddw mm3, [edi+128] /* sum += x8 */
  4150. movq mm4, mm5 /* mm4 = x5 */
  4151. paddw mm4, mm3 /* mm4= sum+x5 */
4152. movq mm5, LoopFilteredValuesDown /* Read the loopfiltered value of x5 */
4153. psraw mm4, 3 /* >>=3 */
  4154. psubw mm4, mm5 /* -=x5 */
  4155. pand mm4, mm7 /* and flag */
  4156. paddw mm4, mm5 /* += x5 */
  4157. movq [esi+64], mm4 /* write new x5 */
  4158. /* sum += p2 - x2 */
4159. /* Des[w1] = (sum+x6)>>3 */
  4160. movq mm5, [edi+96] /* mm5 = x6 */
  4161. psubw mm3, [edi+32] /* -= x2 */
  4162. paddw mm3, mm2 /* += p2 */
  4163. movq mm4, mm5 /* mm4 = x6 */
  4164. paddw mm4, mm3 /* mm4 = sum+x6 */
  4165. psraw mm4, 3 /* >>=3 */
  4166. psubw mm4, mm5 /* -=x6 */
  4167. pand mm4, mm7 /* and flag */
  4168. paddw mm4, mm5 /* += x6 */
  4169. movq [esi+80], mm4 /* write new x6 */
  4170. /* sum += p2 - x3 */
  4171. /* Des[w2] = (sum+x7)>>3 */
  4172. movq mm5, [edi+112] /* mm5 = x7 */
  4173. psubw mm3, [edi+48] /* -= x3 */
  4174. paddw mm3, mm2 /* += p2 */
  4175. movq mm4, mm5 /* mm4 = x7 */
  4176. paddw mm4, mm3 /* mm4 = sum+x7 */
  4177. psraw mm4, 3 /* >>=3 */
  4178. psubw mm4, mm5 /* -=x7 */
  4179. pand mm4, mm7 /* and flag */
  4180. paddw mm4, mm5 /* += x7 */
  4181. movq [esi+96], mm4 /* write new x7 */
  4182. /* sum += p2 - x4 */
4183. /* Des[w3] = (sum+x8)>>3 */
  4184. movq mm5, [edi+128] /* mm5 = x8 */
  4185. psubw mm3, [edi+64] /* -= x4 */
  4186. paddw mm3, mm2 /* += p2 */
  4187. movq mm4, mm5 /* mm4 = x8 */
  4188. paddw mm4, mm3 /* mm4 = sum+x8 */
  4189. psraw mm4, 3 /* >>=3 */
  4190. psubw mm4, mm5 /* -=x8 */
  4191. pand mm4, mm7 /* and flag */
  4192. paddw mm4, mm5 /* += x8 */
  4193. movq [esi+112], mm4 /* write new x8 */
  4194. /* done with left four columns */
4195. /* now do the right four columns */
4196. add edi, 8 /* shift to the right four columns */
4197. add esi, 8 /* shift to the right four columns */
  4198. /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
4199. /* only mm0 is now in use */
  4200. /* find the loop filtered values for the pixels on block boundary */
  4201. movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
  4202. movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
  4203. movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
  4204. movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
  4205. movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
  4206. psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
  4207. psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
  4208. movq mm4, mm5 /* make a copy */
  4209. paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
  4210. paddw mm3, FourFours /* mm3 + 4 */
  4211. paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
  4212. paddw mm3, mm5 /* Filtval before shift */
  4213. psraw mm3, 3 /* FiltVal */
  4214. movq mm2, mm3 /* make a copy */
  4215. psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
  4216. pxor mm2, mm3
  4217. psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
  4218. por mm3, FourOnes /* -1 and 1 for + and - */
  4219. movq mm4, mm1 /* make a copy of Flimit */
  4220. psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
  4221. movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
  4222. psraw mm1, 15 /* FFFF or 0000 */
  4223. pxor mm5, mm1
  4224. psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
  4225. psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
  4226. pmullw mm4, mm3 /* get the sign back */
  4227. movq mm1, [edi+64] /* p[-1] */
  4228. movq mm2, [edi+80] /* p[0] */
  4229. paddw mm1, mm4 /* p[-1] + NewFiltVal */
  4230. psubw mm2, mm4 /* p[0] - NewFiltVal */
  4231. pxor mm6, mm6 /* clear mm6 */
  4232. packuswb mm1, mm1 /* clamping */
  4233. packuswb mm2, mm2 /* clamping */
  4234. punpcklbw mm1, mm6 /* unpack to word */
  4235. movq LoopFilteredValuesUp, mm1 /* save the values */
  4236. punpcklbw mm2, mm6 /* unpack to word */
  4237. movq LoopFilteredValuesDown, mm2 /* save the values */
  4238. /* Let's do the filtering now */
  4239. /* p1 = Src[-5] */
  4240. /* p2 = Src[+4] */
  4241. /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
  4242. movq mm3, [edi] /* mm3 = [-5] */
  4243. movq mm2, [edi+144] /* mm2 = [4] */
4244. movq mm1, mm3 /* p1 = [-5] */
  4245. paddw mm3, mm3 /* mm3 = p1 + p1 */
  4246. movq mm4, [edi+16] /* mm4 = x1 */
  4247. paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
  4248. paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
  4249. paddw mm4, [edi+48] /* mm4 = x1+x3 */
  4250. paddw mm3, [edi+64] /* mm3 += x4 */
  4251. paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
  4252. paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
4253. /* Des[-w4] = (sum + x1) >> 3; or */
4254. /* Des[-w4] = Src[-w4]; */
4255. /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] ) */
  4256. movq mm4, mm3 /* mm4 = mm3 */
  4257. movq mm5, [edi+16] /* mm5 = x1 */
  4258. paddw mm4, mm5 /* mm4 = sum+x1 */
4259. psraw mm4, 3 /* mm4 >>= 3 */
  4260. psubw mm4, mm5 /* New Value - old Value */
  4261. pand mm4, mm0 /* And the flag */
  4262. paddw mm4, mm5 /* add the old value back */
  4263. movq [esi], mm4 /* Write new x1 */
  4264. /* sum += x5 -p1 */
4265. /* Des[-w3]=(sum+x2)>>3 */
  4266. movq mm5, [edi+32] /* mm5= x2 */
  4267. psubw mm3, mm1 /* sum=sum-p1 */
  4268. paddw mm3, [edi+80] /* sum=sum+x5 */
  4269. movq mm4, mm5 /* copy sum */
  4270. paddw mm4, mm3 /* mm4=sum+x2 */
4271. psraw mm4, 3 /* mm4=(sum+x2)>>3 */
  4272. psubw mm4, mm5 /* new value - old value */
  4273. pand mm4, mm0 /* And the flag */
  4274. paddw mm4, mm5 /* add the old value back */
  4275. movq [esi+16], mm4 /* write new x2 */
  4276. /* sum += x6 - p1 */
4277. /* Des[-w2]=(sum+x3)>>3 */
  4278. movq mm5, [edi+48] /* mm5= x3 */
  4279. psubw mm3, mm1 /* sum=sum-p1 */
  4280. paddw mm3, [edi+96] /* sum=sum+x6 */
  4281. movq mm4, mm5 /* copy x3 */
  4282. paddw mm4, mm3 /* mm4=sum+x3 */
4283. psraw mm4, 3 /* mm4=(sum+x3)>>3 */
  4284. psubw mm4, mm5 /* new value - old value */
  4285. pand mm4, mm0 /* And the flag */
  4286. paddw mm4, mm5 /* add the old value back */
  4287. movq [esi+32], mm4 /* write new x3 */
  4288. /* sum += x7 - p1 */
4289. /* Des[-w1]=(sum+x4)>>3 */
  4290. movq mm5, [edi+64] /* mm5 = x4 */
  4291. psubw mm3, mm1 /* sum = sum-p1 */
  4292. paddw mm3, [edi+112] /* sum = sum+x7 */
  4293. movq mm4, mm5 /* mm4 = x4 */
  4294. paddw mm4, mm3 /* mm4 = sum + x4 */
  4295. movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
4296. psraw mm4, 3 /* >>=3 */
  4297. psubw mm4, mm5 /* -=x4 */
  4298. pand mm4, mm0 /* and flag */
  4299. paddw mm4, mm5 /* += x4 */
  4300. movq [esi+48], mm4 /* write new x4 */
  4301. /* sum+= x8-x1 */
4302. /* Des[0]=(sum+x5)>>3 */
  4303. movq mm5, [edi+80] /* mm5 = x5 */
  4304. psubw mm3, [edi+16] /* sum -= x1 */
4305. paddw mm3, [edi+128] /* sum += x8 */
  4306. movq mm4, mm5 /* mm4 = x5 */
  4307. paddw mm4, mm3 /* mm4= sum+x5 */
4308. movq mm5, LoopFilteredValuesDown /* Read the loopfiltered value of x5 */
4309. psraw mm4, 3 /* >>=3 */
  4310. psubw mm4, mm5 /* -=x5 */
  4311. pand mm4, mm0 /* and flag */
  4312. paddw mm4, mm5 /* += x5 */
  4313. movq [esi+64], mm4 /* write new x5 */
  4314. /* sum += p2 - x2 */
4315. /* Des[w1] = (sum+x6)>>3 */
  4316. movq mm5, [edi+96] /* mm5 = x6 */
  4317. psubw mm3, [edi+32] /* -= x2 */
  4318. paddw mm3, mm2 /* += p2 */
  4319. movq mm4, mm5 /* mm4 = x6 */
  4320. paddw mm4, mm3 /* mm4 = sum+x6 */
  4321. psraw mm4, 3 /* >>=3 */
  4322. psubw mm4, mm5 /* -=x6 */
  4323. pand mm4, mm0 /* and flag */
  4324. paddw mm4, mm5 /* += x6 */
  4325. movq [esi+80], mm4 /* write new x6 */
  4326. /* sum += p2 - x3 */
  4327. /* Des[w2] = (sum+x7)>>3 */
  4328. movq mm5, [edi+112] /* mm5 = x7 */
  4329. psubw mm3, [edi+48] /* -= x3 */
  4330. paddw mm3, mm2 /* += p2 */
  4331. movq mm4, mm5 /* mm4 = x7 */
  4332. paddw mm4, mm3 /* mm4 = sum+x7 */
  4333. psraw mm4, 3 /* >>=3 */
  4334. psubw mm4, mm5 /* -=x7 */
  4335. pand mm4, mm0 /* and flag */
  4336. paddw mm4, mm5 /* += x7 */
  4337. movq [esi+96], mm4 /* write new x7 */
  4338. /* sum += p2 - x4 */
4339. /* Des[w3] = (sum+x8)>>3 */
  4340. movq mm5, [edi+128] /* mm5 = x8 */
  4341. psubw mm3, [edi+64] /* -= x4 */
  4342. paddw mm3, mm2 /* += p2 */
  4343. movq mm4, mm5 /* mm4 = x8 */
  4344. paddw mm4, mm3 /* mm4 = sum+x8 */
  4345. psraw mm4, 3 /* >>=3 */
  4346. psubw mm4, mm5 /* -=x8 */
  4347. pand mm4, mm0 /* and flag */
  4348. paddw mm4, mm5 /* += x8 */
  4349. movq [esi+112], mm4 /* write new x8 */
4350. /* done with the right four columns */
  4351. /* transpose */
  4352. mov eax, Des /* the destination */
  4353. add edi, 8 /* shift edi to point x1 */
  4354. sub esi, 8 /* shift esi back to left x1 */
  4355. sub eax, 4
  4356. movq mm0, [esi] /* mm0 = 30 20 10 00 */
  4357. movq mm1, [esi+16] /* mm1 = 31 21 11 01 */
  4358. movq mm4, mm0 /* mm4 = 30 20 10 00 */
  4359. punpcklwd mm0, mm1 /* mm0 = 11 10 01 00 */
  4360. punpckhwd mm4, mm1 /* mm4 = 31 30 21 20 */
  4361. movq mm2, [esi+32] /* mm2 = 32 22 12 02 */
  4362. movq mm3, [esi+48] /* mm3 = 33 23 13 03 */
  4363. movq mm5, mm2 /* mm5 = 32 22 12 02 */
  4364. punpcklwd mm2, mm3 /* mm2 = 13 12 03 02 */
  4365. punpckhwd mm5, mm3 /* mm5 = 33 32 23 22 */
  4366. movq mm1, mm0 /* mm1 = 11 10 01 00 */
  4367. punpckldq mm0, mm2 /* mm0 = 03 02 01 00 */
  4368. movq [edi], mm0 /* write 00 01 02 03 */
  4369. punpckhdq mm1, mm2 /* mm1 = 13 12 11 10 */
  4370. movq mm0, mm4 /* mm0 = 31 30 21 20 */
  4371. movq [edi+16], mm1 /* write 10 11 12 13 */
  4372. punpckldq mm0, mm5 /* mm0 = 23 22 21 20 */
  4373. punpckhdq mm4, mm5 /* mm4 = 33 32 31 30 */
  4374. movq mm1, [esi+64] /* mm1 = 34 24 14 04 */
  4375. movq mm2, [esi+80] /* mm2 = 35 25 15 05 */
  4376. movq mm5, [esi+96] /* mm5 = 36 26 16 06 */
  4377. movq mm6, [esi+112] /* mm6 = 37 27 17 07 */
  4378. movq mm3, mm1 /* mm3 = 34 24 14 04 */
  4379. movq mm7, mm5 /* mm7 = 36 26 16 06 */
  4380. punpcklwd mm1, mm2 /* mm1 = 15 14 05 04 */
  4381. punpckhwd mm3, mm2 /* mm3 = 35 34 25 24 */
  4382. punpcklwd mm5, mm6 /* mm5 = 17 16 07 06 */
  4383. punpckhwd mm7, mm6 /* mm7 = 37 36 27 26 */
  4384. movq mm2, mm1 /* mm2 = 15 14 05 04 */
  4385. movq mm6, mm3 /* mm6 = 35 34 25 24 */
  4386. punpckldq mm1, mm5 /* mm1 = 07 06 05 04 */
  4387. punpckhdq mm2, mm5 /* mm2 = 17 16 15 14 */
  4388. punpckldq mm3, mm7 /* mm3 = 27 26 25 24 */
  4389. punpckhdq mm6, mm7 /* mm6 = 37 36 35 34 */
  4390. movq mm5, [edi] /* mm5 = 03 02 01 00 */
  4391. packuswb mm5, mm1 /* mm5 = 07 06 05 04 03 02 01 00 */
  4392. movq [eax], mm5 /* write 00 01 02 03 04 05 06 07 */
  4393. movq mm7, [edi+16] /* mm7 = 13 12 11 10 */
  4394. packuswb mm7, mm2 /* mm7 = 17 16 15 14 13 12 11 10 */
  4395. movq [eax+ecx], mm7 /* write 10 11 12 13 14 15 16 17 */
  4396. packuswb mm0, mm3 /* mm0 = 27 26 25 24 23 22 21 20 */
  4397. packuswb mm4, mm6 /* mm4 = 37 36 35 34 33 32 31 30 */
  4398. movq [eax+ecx*2], mm0 /* write 20 21 22 23 24 25 26 27 */
4399. lea eax, [eax+ecx*4] /* move the destination pointer forward */
  4400. movq [eax+edx], mm4 /* write 30 31 32 33 34 35 36 37 */
  4401. add edi, 8 /* move to right four column */
  4402. add esi, 8 /* move to right x1 */
  4403. movq mm0, [esi] /* mm0 = 70 60 50 40 */
  4404. movq mm1, [esi+16] /* mm1 = 71 61 51 41 */
  4405. movq mm4, mm0 /* mm4 = 70 60 50 40 */
  4406. punpcklwd mm0, mm1 /* mm0 = 51 50 41 40 */
  4407. punpckhwd mm4, mm1 /* mm4 = 71 70 61 60 */
  4408. movq mm2, [esi+32] /* mm2 = 72 62 52 42 */
  4409. movq mm3, [esi+48] /* mm3 = 73 63 53 43 */
  4410. movq mm5, mm2 /* mm5 = 72 62 52 42 */
  4411. punpcklwd mm2, mm3 /* mm2 = 53 52 43 42 */
  4412. punpckhwd mm5, mm3 /* mm5 = 73 72 63 62 */
  4413. movq mm1, mm0 /* mm1 = 51 50 41 40 */
  4414. punpckldq mm0, mm2 /* mm0 = 43 42 41 40 */
  4415. movq [edi], mm0 /* write 40 41 42 43 */
  4416. punpckhdq mm1, mm2 /* mm1 = 53 52 51 50 */
  4417. movq mm0, mm4 /* mm0 = 71 70 61 60 */
  4418. movq [edi+16], mm1 /* write 50 51 52 53 */
  4419. punpckldq mm0, mm5 /* mm0 = 63 62 61 60 */
  4420. punpckhdq mm4, mm5 /* mm4 = 73 72 71 70 */
  4421. movq mm1, [esi+64] /* mm1 = 74 64 54 44 */
  4422. movq mm2, [esi+80] /* mm2 = 75 65 55 45 */
  4423. movq mm5, [esi+96] /* mm5 = 76 66 56 46 */
  4424. movq mm6, [esi+112] /* mm6 = 77 67 57 47 */
  4425. movq mm3, mm1 /* mm3 = 74 64 54 44 */
  4426. movq mm7, mm5 /* mm7 = 76 66 56 46 */
  4427. punpcklwd mm1, mm2 /* mm1 = 55 54 45 44 */
  4428. punpckhwd mm3, mm2 /* mm3 = 75 74 65 64 */
  4429. punpcklwd mm5, mm6 /* mm5 = 57 56 47 46 */
  4430. punpckhwd mm7, mm6 /* mm7 = 77 76 67 66 */
  4431. movq mm2, mm1 /* mm2 = 55 54 45 44 */
  4432. movq mm6, mm3 /* mm6 = 75 74 65 64 */
  4433. punpckldq mm1, mm5 /* mm1 = 47 46 45 44 */
  4434. punpckhdq mm2, mm5 /* mm2 = 57 56 55 54 */
  4435. punpckldq mm3, mm7 /* mm3 = 67 66 65 64 */
  4436. punpckhdq mm6, mm7 /* mm6 = 77 76 75 74 */
  4437. movq mm5, [edi] /* mm5 = 43 42 41 40 */
  4438. packuswb mm5, mm1 /* mm5 = 47 46 45 44 43 42 41 40 */
  4439. movq [eax], mm5 /* write 40 41 42 43 44 45 46 47 */
  4440. movq mm7, [edi+16] /* mm7 = 53 52 51 50 */
  4441. packuswb mm7, mm2 /* mm7 = 57 56 55 54 53 52 51 50 */
  4442. movq [eax+ecx], mm7 /* write 50 51 52 53 54 55 56 57 */
  4443. packuswb mm0, mm3 /* mm0 = 67 66 65 64 63 62 61 60 */
  4444. packuswb mm4, mm6 /* mm4 = 77 76 75 74 73 72 71 70 */
  4445. movq [eax+ecx*2], mm0 /* write 60 61 62 63 64 65 66 67 */
4446. lea eax, [eax+ecx*4] /* move the destination pointer forward */
  4447. movq [eax+edx], mm4 /* write 70 71 72 73 74 75 76 77 */
  4448. pop edi
  4449. pop esi
  4450. pop edx
  4451. pop ecx
  4452. pop ebp
  4453. pop eax
  4454. }//__asm
  4455. Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
  4456. Var1 += Variance11[4]+ Variance11[5]+Variance11[6]+Variance11[7];
  4457. pbi->FragmentVariances[CurrentFrag-1] += Var1;
  4458. Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
  4459. Var2 += Variance21[4]+ Variance21[5]+Variance21[6]+Variance21[7];
  4460. pbi->FragmentVariances[CurrentFrag] += Var2;
  4461. CurrentFrag ++;
  4462. }//else
  4463. }//while
  4464. }
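/*
 * For reference, a scalar sketch of the smoothing performed by the MMX
 * code above, one column at a time (illustrative only: this helper is not
 * part of the original build and its name and signature are hypothetical).
 * It shows the running sum the asm maintains: seed with
 * 3*p1 + x1 + x2 + x3 + x4 + 4, emit (sum + x[i]) >> 3 for each of the
 * eight pixels, then slide the window by adding the next tap and dropping
 * the oldest.  The per-block filter flag and the loop-filtered base
 * values for x4/x5 are omitted for clarity.
 */
#if 0
static void DeblockColumnSketchC( const unsigned char *Src,
                                  unsigned char *Des, int Pitch )
{
    int i, sum, x[8];
    int p1 = Src[-5 * Pitch];                /* pad on the low side  */
    int p2 = Src[ 4 * Pitch];                /* pad on the high side */
    for( i = 0; i < 8; i++ )
        x[i] = Src[(i - 4) * Pitch];
    sum = p1 + p1 + p1 + x[0] + x[1] + x[2] + x[3] + 4;
    for( i = 0; i < 8; i++ )
    {
        Des[(i - 4) * Pitch] = (unsigned char)((sum + x[i]) >> 3);
        sum += (i < 4 ? x[i + 4] : p2)       /* next tap entering    */
             - (i < 3 ? p1 : x[i - 3]);      /* oldest tap leaving   */
    }
}
#endif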
  4465. /****************************************************************************
  4466. *
  4467. * ROUTINE : PlaneAddNoise_mmx
  4468. *
  4469. * INPUTS : UINT8 *Start starting address of buffer to add gaussian
  4470. * noise to
  4471. * UINT32 Width width of plane
  4472. * UINT32 Height height of plane
  4473. * INT32 Pitch distance between subsequent lines of frame
  4474. * INT32 q quantizer used to determine amount of noise
  4475. * to add
  4476. *
  4477. * OUTPUTS : None.
  4478. *
  4479. * RETURNS : void.
  4480. *
  4481. * FUNCTION : adds gaussian noise to a plane of pixels
  4482. *
  4483. * SPECIAL NOTES : None.
  4484. *
  4485. ****************************************************************************/
  4486. void PlaneAddNoise_mmx( UINT8 *Start, UINT32 Width, UINT32 Height, INT32 Pitch, int q)
  4487. {
  4488. unsigned int i;
  4489. INT32 Pitch4 = Pitch * 4;
  4490. const int noiseAmount = 2;
  4491. const int noiseAdder = 2 * noiseAmount + 1;
  4492. #if defined(_WIN32_WCE)
  4493. #pragma pack(16)
  4494. unsigned char blackclamp[16];
  4495. unsigned char whiteclamp[16];
  4496. unsigned char bothclamp[16];
  4497. #pragma pack()
  4498. #else
  4499. __declspec(align(16)) unsigned char blackclamp[16];
  4500. __declspec(align(16)) unsigned char whiteclamp[16];
  4501. __declspec(align(16)) unsigned char bothclamp[16];
  4502. #endif
  4503. char CharDist[300];
  4504. char Rand[2048] =
  4505. {
  4506. -2,0,-2,2,0,0,-1,2,2,1,-2,2,1,0,-1,-2,-2,-1,-2,-2,2,0,-2,-2,-2,-1,0,0,1,1,-2,1,0,-1,-2,1,1,2,0,-1,2,1,2,2,0,-2,0,-1,2,-1,1,2,2,2,1,-1,-1,-1,2,-2,-1,-2,1,-2,-2,2,-1,-1,0,1,2,1,0,-1,1,0,0,2,1,-2,0,-1,1,1,0,-1,-2,-1,0,2,0,2,1,-1,-2,1,0,-2,1,0,-2,2,-2,2,1,-1,0,-2,2,1,-2,2,2,0,-2,-2,2,0,-2,0,1,0,-1,0,1,1,1,0,-2,-1,2,-2,0,1,0,-2,2,2,0,-1,0,-1,2,-1,0,-1,2,-1,1,0,-2,1,2,-1,0,2,-2,2,0,-2,0,-2,2,1,1,-2,2,-2,-2,1,-1,2,-1,-1,-2,1,2,1,1,1,-1,-2,-2,-2,2,2,-1,-2,0,-2,-2,0,1,1,0,-2,0,-1,1,-1,0,-1,0,0,1,-2,0,2,1,2,-2,-1,-2,2,0,2,-2,1,-2,0,2,-2,2,-1,-1,1,0,-1,1,1,0,0,0,1,2,2,1,1,0,-1,-2,1,0,2,-1,-2,1,1,0,-1,0,-2,1,1,1,1,2,-2,0,2,2,1,1,-2,1,2,-1,0,-1,-2,-2,2,2,1,-2,-1,-2,-2,1,2,0,0,0,-1,0,0,-2,-1,1,-1,2,2,2,1,-1,2,-2,-2,1,0,1,2,-2,2,1,-1,-2,0,-1,-1,2,0,1,-2,0,-1,0,1,0,-1,1,0,1,-1,-2,1,-2,1,2,0,1,0,-1,1,0,-1,2,1,-2,-1,-2,1,2,1,-2,-1,-2,1,-2,2,2,0,1,2,-2,-2,1,1,-1,-2,-2,1,-1,-1,-1,1,2,2,0,1,1,2,-2,1,0,-1,-2,2,-2,0,0,-1,0,-1,-1,-2,2,-2,-1,1,2,1,1,1,-1,2,-1,2,-1,-1,0,2,-2,-2,0,0,-2,-1,2,-1,-2,-2,2,-2,-2,-2,-1,2,-1,0,2,2,0,2,1,-1,-1,-2,0,2,-1,-1,0,-1,1,2,0,2,-2,2,1,1,0,-2,-1,-1,-2,0,-2,1,2,-2,2,1,1,2,0,1,-2,1,1,1,-2,2,1,1,-2,0,2,-2,-1,-2,2,1,-1,2,-1,1,-1,-2,-1,0,2,-2,2,0,-2,1,-2,2,1,2,-1,0,-2,1,-2,0,-1,2,-2,-1,-2,-1,-2,1,2,2,-2,1,1,1,2,0,2,1,-2,1,0,0,2,0,0,0,-1,-1,-1,-2,1,-2,-2,-1,0,-2,
  4507. -2,-2,1,0,1,1,0,1,-1,2,0,-2,2,2,-1,2,-2,2,0,0,1,1,-2,-1,-1,0,2,1,1,2,-1,-1,2,-1,-1,0,-1,1,1,1,1,-2,-1,-1,1,2,-1,0,-2,2,-1,0,1,0,1,-2,-2,-2,-2,-1,-1,1,-2,-1,-2,1,1,-2,1,1,1,0,-2,0,-2,2,0,2,1,0,1,1,-1,-1,-2,2,-2,-2,-1,1,-1,-1,0,-2,0,0,1,1,0,-1,2,2,1,2,-2,0,2,-1,-1,-1,-2,1,-1,-2,-2,0,2,2,0,1,1,2,2,0,0,-2,1,0,0,0,0,2,1,-1,-2,-1,-1,-1,1,-1,2,-2,1,1,2,-2,0,2,1,2,-2,2,1,2,2,2,1,-2,1,-1,-1,1,1,-2,1,0,-2,2,2,-2,-1,0,0,1,-2,1,2,-2,1,1,-2,-2,-1,1,2,0,-1,1,-1,1,-1,-1,2,-1,-2,1,-2,-2,-2,-1,1,-1,0,0,-2,0,1,-1,1,2,0,0,-2,0,-1,0,2,0,-2,0,1,1,2,2,-1,2,1,1,2,1,2,2,2,0,0,-2,-1,2,0,-2,-2,1,1,-2,-2,-1,1,2,-2,-2,-2,-1,-2,2,1,-2,2,1,0,-2,-1,-1,1,1,-2,2,-2,1,0,2,0,-1,-1,1,-1,0,1,-2,2,1,-2,0,1,2,1,1,1,2,1,-1,0,-1,0,1,-1,0,0,2,1,1,1,0,1,1,2,-1,1,2,0,2,0,0,0,2,2,-2,-1,-1,1,2,1,-2,1,-2,0,0,0,-2,2,-2,1,-2,-2,1,-1,-1,1,0,0,-1,1,-2,0,0,2,0,-2,-1,-1,-2,2,1,2,1,1,0,1,1,2,0,-1,-2,2,2,0,-2,2,1,-2,0,2,-2,-2,-1,-2,0,-2,1,0,1,1,2,1,-1,2,-1,2,1,-1,-2,-1,-2,0,-2,2,-2,-1,-1,-2,-2,-2,1,1,2,-2,0,0,2,0,0,1,-1,0,-2,2,2,2,-2,0,1,1,1,-1,2,1,-2,0,-2,0,1,1,-2,1,0,2,2,1,-1,-1,0,-2,1,-2,1,1,-1,-2,-2,1,-2,-1,1,1,0,2,1,-1,0,2,-2,-2,-2,-2,2,-1,-1,2,-2,2,-1,2,-1,-1,-1,-1,2,2,2,2,1,-2,-2,-2,-1,0,-2,2,1,0,2,0,1,2,2,2,2,-2,-1,-1,-2,2,1,1,-2,1,2,1,2,-2,1,-1,1,2,2,-2,1,0,-2,-1,0,-2,2,0,-1,1,2,-1,-2,1,-1,0,2,2,-1,0,2,2,1,
  4508. -1,2,-1,-1,-2,0,-1,-2,-1,2,-1,2,-2,2,2,0,-1,1,0,1,0,-2,2,-2,-1,-1,1,0,2,1,1,0,2,1,-2,0,-2,-2,1,-1,2,0,1,-2,1,-2,1,2,0,1,-1,2,1,0,-1,2,0,1,-1,-2,0,1,0,-1,-2,-1,0,2,0,2,-1,0,-2,2,2,0,1,-1,1,0,0,-2,-1,-1,2,2,2,1,0,-2,0,-1,0,-2,2,-1,1,2,0,-1,-1,0,2,-1,-1,1,2,-1,-2,0,2,0,-2,2,-2,1,-1,-2,-2,-1,0,2,-2,-2,-1,-1,0,0,0,2,1,-1,0,0,2,0,2,1,2,0,2,-1,2,-1,2,1,-2,1,0,-2,-2,-2,0,2,-2,-2,-1,2,1,1,1,-1,1,2,2,-1,0,-2,-2,-2,-1,1,0,-2,-1,-2,1,-2,-2,0,-1,2,-2,2,-2,-2,-2,2,-1,0,-1,0,1,2,2,2,-2,-2,0,2,2,-2,2,2,-1,0,1,0,-1,2,2,1,0,-1,-2,-2,1,0,-1,-1,0,1,2,1,2,-1,0,-1,2,0,-1,0,0,-1,-1,-2,-1,-1,2,1,2,1,1,-1,1,-2,1,2,-1,-2,0,-2,2,1,0,1,0,1,1,1,1,2,-2,0,1,-2,0,-2,0,-1,-2,-1,2,0,1,-2,-1,2,2,-1,-1,-1,-2,2,-2,-2,-1,-1,1,1,-2,-1,-2,-1,0,-2,1,-2,0,1,-1,-2,-1,1,2,0,2,-2,1,2,1,1,0,0,-2,2,-1,-2,-1,-1,0,1,-1,2,-1,1,-1,-2,1,-1,-1,1,2,-1,2,-1,2,1,-1,-1,-1,0,-1,-1,-2,-2,1,2,1,2,-2,0,1,2,-1,1,1,2,2,2,1,-1,1,-2,0,1,-1,2,-2,0,-2,1,-1,-2,-1,-2,2,1,-2,0,-2,2,-2,0,2,0,2,0,0,0,1,2,2,-1,-2,1,-2,1,0,2,1,-1,0,-1,1,2,-2,-2,-1,-1,-1,2,2,-1,-2,0,0,2,0,-1,0,-1,0,2,-1,-1,2,0,0,1,1,-2,-2,-1,-2,-1,0,1,-1,-2,1,-2,-1,2,0,2,-1,-2,0,-1,-2,0,1,-2,2,-1,2,0,-1,-1,0,-1,0,1,2,-1,0,1,1,-2,-2,1,2,1,-1,0,-2,0,-2,-1,2,-1,-1,-2,-1,-2,-1,-1,-2,-1,-2,0,2,2,0,2,-2,0,0,1,-1,2,-1,-1,2,2,1,1,-2,-1,-1,2,2,0,1,-1,2,0,-2,2,-2,-1,-1,1,0,0,-2,
  4509. 2,-2,-2,2,0,1,-2,-2,0,1,0,2,2,-1,0,2,-2,2,0,-1,-2,-1,-2,-2,-2,2,0,1,-1,1,1,2,2,2,-1,-2,-2,2,-2,2,-1,2,-1,-1,1,2,-1,0,1,-1,0,0,2,1,1,0,2,0,-1,-1,-2,2,1,-1,-1,-1,-1,-2,2,-1,0,-2,2,1,1,-2,0,1,0,1,2,-2,-1,2,1,-2,2,-2,1,-2,-2,-2,0,0,0,-1,-2,-1,-2,0,-2,-1
  4510. };
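/*
 * Note: the initializer above only seeds the storage; every entry of
 * Rand[] is regenerated below from CharDist once the gaussian lookup has
 * been rebuilt for the current q.
 */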
  4511. double sigma;
4512. __asm emms /* empty the MMX state before the floating-point code below */
  4513. sigma = 1 + .8*(63-q) / 63.0;
  4514. // set up a lookup table of 256 entries that matches
  4515. // a gaussian distribution with sigma determined by q.
  4516. //
  4517. {
  4518. double i,sum=0;
  4519. int next,j;
  4520. next=0;
  4521. for(i=-32;i<32;i++)
  4522. {
  4523. int a = (int)(.5+256*gaussian(sigma,0,i));
  4524. if(a)
  4525. {
  4526. for(j=0;j<a;j++)
  4527. {
  4528. CharDist[next+j]=(char) i;
  4529. }
  4530. next = next+j;
  4531. }
  4532. }
4533. for( ; next<256; next++)
  4534. CharDist[next] = 0;
  4535. }
  4536. for(i=0;i<2048;i++)
  4537. {
  4538. Rand[i]=CharDist[rand() & 0xff];
  4539. }
  4540. for(i=0;i<16;i++)
  4541. {
  4542. blackclamp[i]=-CharDist[0];
  4543. whiteclamp[i]=-CharDist[0];
  4544. bothclamp[i]=-2*CharDist[0];
  4545. }
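/*
 * CharDist[0] is the most negative value in the noise distribution, so
 * -CharDist[0] is the peak noise magnitude.  Clamping each pixel into
 * [clamp, 255 - clamp] before the add keeps the signed paddb in the asm
 * below from wrapping past 0 or 255.
 */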
  4546. for(i=0;i<Height;i++)
  4547. {
  4548. UINT8 *Pos = Start + i *Pitch;
  4549. INT8 *Ref = Rand + (rand() & 0xff);
  4550. __asm
  4551. {
  4552. mov ecx, [Width]
  4553. mov esi,Pos
  4554. mov edi,Ref
  4555. xor eax,eax
  4556. nextset:
  4557. movq mm1,[esi+eax] // get the source
  4558. psubusb mm1,blackclamp // clamp both sides so we don't outrange adding noise
  4559. paddusb mm1,bothclamp
  4560. psubusb mm1,whiteclamp
  4561. movq mm2,[edi+eax] // get the noise for this line
  4562. paddb mm1,mm2 // add it in
  4563. movq [esi+eax],mm1 // store the result
4564. add eax,8 // advance to the next eight pixels
  4565. cmp eax, ecx
  4566. jl nextset
  4567. }
  4568. }
  4569. }
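/*
 * Illustrative scalar equivalent of the MMX inner loop above (this helper
 * is hypothetical and not part of the original source): clamp the pixel
 * away from both extremes, then add the signed noise for the row.
 */
#if 0
static void AddNoiseRowSketchC( UINT8 *Pos, const INT8 *Ref,
                                UINT32 Width, int clamp )
{
    UINT32 x;
    for( x = 0; x < Width; x++ )
    {
        int p = Pos[x];
        if( p < clamp )       p = clamp;        /* psubusb blackclamp   */
        if( p > 255 - clamp ) p = 255 - clamp;  /* paddusb bothclamp,   */
                                                /* psubusb whiteclamp   */
        Pos[x] = (UINT8)( p + Ref[x] );         /* paddb mm1, mm2       */
    }
}
#endif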