12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692 |
- /****************************************************************************
- *
- * Module Title : DeblockOpt.c
- *
- * Description : Optimized functions for deblocking
- *
- * AUTHOR : Yaowu Xu
- *
- *****************************************************************************
- * Revision History
- *
- * 1.04 YWX 21-Mar-02 bug fixed in functions using abs diff criteria
- * 1.03 YWX 15-Jun-01 Added new 7 tap filter in deblocking
- * 1.02 YWX 02-May-01 Changed to use sum of abs diff to replace variance
- * 1.01 YWX 17-Nov-00 Re-arranged loop inside deblockNonFilteredBand()
- * 1.00 YWX 02-Nov-00 Configuration baseline from old PPoptfunctions.c
- *
- *****************************************************************************
- */
- /****************************************************************************
- * Header Frames
- *****************************************************************************
- */
- #ifdef _MSC_VER
- #pragma warning(disable:4799)
- #pragma warning(disable:4731)
- #endif
- #define STRICT /* Strict type checking. */
- #include "postp.h"
- #include <stdio.h>
- #include <stdlib.h>
- /****************************************************************************
- * Module constants.
- *****************************************************************************
- */
- #if defined(_WIN32_WCE)
- #pragma pack(16)
- static short Eight128s[] = {128, 128, 128, 128,128, 128, 128, 128 };
- static short Eight64s[] = {64, 64, 64, 64, 64, 64, 64, 64 };
- static short EightThrees[]= {3, 3, 3, 3, 3, 3, 3, 3};
- static short EightFours[]= {4, 4, 4, 4, 4, 4, 4, 4};
- static short Four128s[] = {128, 128, 128, 128};
- static short Four64s[] = {64, 64, 64, 64 };
- static short FourThrees[]= {3, 3, 3, 3};
- static short FourFours[]= {4, 4, 4, 4};
- static short FourOnes[]= { 1, 1, 1, 1};
- static unsigned char Eight128c[] = {128, 128, 128, 128,128, 128, 128, 128 };
- #pragma pack()
- #else
- __declspec(align(16)) static short Eight128s[] = {128, 128, 128, 128,128, 128, 128, 128 };
- __declspec(align(16)) static short Eight64s[] = {64, 64, 64, 64, 64, 64, 64, 64 };
- __declspec(align(16)) static short EightThrees[]= {3, 3, 3, 3, 3, 3, 3, 3};
- __declspec(align(16)) static short EightFours[]= {4, 4, 4, 4, 4, 4, 4, 4};
- __declspec(align(16)) static short Four128s[] = {128, 128, 128, 128};
- __declspec(align(16)) static short Four64s[] = {64, 64, 64, 64 };
- __declspec(align(16)) static short FourThrees[]= {3, 3, 3, 3};
- __declspec(align(16)) static short FourFours[]= {4, 4, 4, 4};
- __declspec(align(16)) static short FourOnes[]= { 1, 1, 1, 1};
- __declspec(align(16)) static unsigned char Eight128c[] = {128, 128, 128, 128,128, 128, 128, 128 };
- #endif
- /****************************************************************************
- * Explicit Imports
- *****************************************************************************
- */
- extern UINT32 *DeblockLimitValuesV2;
- /****************************************************************************
- * Exported Global Variables
- *****************************************************************************
- */
- /****************************************************************************
- * Exported Functions
- *****************************************************************************
- */
- extern double gaussian(double sigma, double mu, double x);
- /****************************************************************************
- * Module Statics
- *****************************************************************************
- */
- /****************************************************************************
- *
- * ROUTINE : SetupBoundingValueArray_ForMMX
- *
- * INPUTS :
- *
- * OUTPUTS : None
- *
- * RETURNS : None
- *
- * FUNCTION : Applies a loop filter to the edge pixels of coded blocks.
- *
- * SPECIAL NOTES :
- *
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- INT32 *SetupDeblockValueArray_ForMMX(POSTPROC_INSTANCE *pbi, INT32 FLimit)
- {
- INT32 * BoundingValuePtr;
- /*
- Since the FiltBoundingValue array is currently only used in the generic version, we are going
- to reuse this memory for our own purposes.
- 2 longs for limit, 2 longs for _4ONES, 2 longs for LFABS_MMX, and 8 longs for temp work storage
- */
- BoundingValuePtr = (INT32 *)((UINT32)(&pbi->DeblockBoundingValue[256]) & 0xffffffe0);
- //expand for mmx code
- BoundingValuePtr[0] = BoundingValuePtr[1] = FLimit * 0x00010001;
- BoundingValuePtr[2] = BoundingValuePtr[3] = 0x00010001;
- BoundingValuePtr[4] = BoundingValuePtr[5] = 0x00040004;
- return BoundingValuePtr;
- }
- /****************************************************************************
- *
- * ROUTINE : DeblockLoopFilteredBand_MMX
- *
- * INPUTS : None
- *
- * OUTPUTS : None
- *
- * RETURNS : None
- *
- * FUNCTION : Filter both horizontal and vertical edge in a band
- *
- * SPECIAL NOTES :
- *
- * REFERENCE :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void DeblockLoopFilteredBand_MMX(
- POSTPROC_INSTANCE *pbi,
- UINT8 *SrcPtr,
- UINT8 *DesPtr,
- UINT32 PlaneLineStep,
- UINT32 FragAcross,
- UINT32 StartFrag,
- UINT32 *QuantScale
- )
- {
- UINT32 j;
- UINT32 CurrentFrag=StartFrag;
- UINT32 QStep;
- UINT8 *Src, *Des;
- UINT32 Var1, Var2;
- #if defined(_WIN32_WCE)
- #pragma pack(16)
- short QStepMmx[4];
- short FLimitMmx[4];
- short Rows[80];
- short NewRows[64];
- unsigned short Variance11[4];
- unsigned short Variance12[4];
- unsigned short Variance21[4];
- unsigned short Variance22[4];
- #pragma pack()
- #else
- __declspec(align(16)) short QStepMmx[4];
- __declspec(align(16)) short FLimitMmx[4];
- __declspec(align(16)) short Rows[80];
- __declspec(align(16)) short NewRows[64];
- __declspec(align(16)) unsigned short Variance11[4];
- __declspec(align(16)) unsigned short Variance12[4];
- __declspec(align(16)) unsigned short Variance21[4];
- __declspec(align(16)) unsigned short Variance22[4];
- #endif
- Src=SrcPtr;
- Des=DesPtr;
- while(CurrentFrag < StartFrag + FragAcross )
- {
-
- QStep = QuantScale[ pbi->FragQIndex[CurrentFrag+FragAcross]];
- if( QStep > 3 )
- {
- QStepMmx[0] = (INT16)QStep;
- QStepMmx[1] = (INT16)QStep;
- QStepMmx[2] = (INT16)QStep;
- QStepMmx[3] = (INT16)QStep;
- __asm
- {
-
- /* Save the registers */
- push eax
- push ebp
- push ecx
- push edx
- push esi
- push edi
-
-
- /* Calculate the FLimit and store FLimit and QStep */
-
- movq mm0, QStepMmx /* mm0 = QStep */
- movq mm1, FourThrees /* mm1 = 03030303 */
- pmullw mm1, mm0 /* mm1 = QStep * 3 */
- pmullw mm1, mm0 /* mm1 = QStep * QStep * 3 */
-
- psrlw mm1, 5 /* mm1 = FLimit */
- movq [FLimitMmx], mm1 /* Save FLimit */
-
- /* Copy the data to the intermediate buffer */
-
- mov eax, Src /* eax = Src */
- xor edx, edx /* clear edx */
-
- lea esi, NewRows /* esi = NewRows */
- lea edi, Rows /* edi = Rows */
- mov ecx, PlaneLineStep /* ecx = Pitch */
-
- pxor mm7, mm7 /* Clear mm7 */
- sub edx, ecx /* edx = -Pitch */
-
- lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
- movq mm0, [eax + edx] /* mm0 = Src[-5*Pitch] */
-
- movq mm1, mm0 /* mm1 = mm0 */
- punpcklbw mm0, mm7 /* Lower Four -5 */
-
- movq mm2, [eax] /* mm2 = Src[-4*Pitch] */
- movq mm3, mm2 /* mm3 = mm2 */
-
- punpckhbw mm1, mm7 /* Higher Four -5 */
- movq [edi], mm0 /* Write Lower Four of -5 */
-
- punpcklbw mm2, mm7 /* Lower Four -4 */
- punpckhbw mm3, mm7 /* higher Four -4 */
-
- movq [edi+8], mm1 /* Write Higher Four of -5 */
- movq mm4, [eax + ecx] /* mm4 = Src[-3*Pitch] */
-
- movq [edi+16], mm2 /* Write Lower -4 */
- movq [edi+24], mm3 /* write hight -4 */
-
- movq mm5, mm4 /* mm5 = mm4 */
- punpcklbw mm4, mm7 /* lower four -3 */
-
- movq mm0, [eax + ecx *2] /* mm0 = Src[-2*Pitch] */
- punpckhbw mm5, mm7 /* higher four -3 */
-
- movq mm1, mm0 /* mm1 = mm0 */
- movq [edi+32], mm4 /* write Lower -3 */
-
- punpcklbw mm0, mm7 /* lower four -2 */
- lea eax, [eax + ecx *4] /* eax = Src */
-
- movq [edi+40], mm5 /* write Higher -3 */
- punpckhbw mm1, mm7 /* higher four -2 */
-
- movq mm2, [eax + edx] /* mm2 = Src[-Pitch] */
- movq [edi+48], mm0 /* lower -2 */
-
- movq mm3, mm2 /* mm3 = mm2 */
- punpcklbw mm2, mm7 /* lower -1 */
-
- movq [edi+56], mm1 /* higher -2 */
- punpckhbw mm3, mm7 /* Higher -1 */
-
- movq mm4, [eax] /* mm4 = Src[0] */
- movq [edi+64], mm2 /* Lower -1 */
-
- movq mm5, mm4 /* mm5 = mm4 */
- movq [edi+72], mm3 /* Higher -1 */
-
- punpcklbw mm4, mm7 /* lower 0 */
- punpckhbw mm5, mm7 /* higher 0 */
-
- movq mm0, [eax + ecx] /* mm0 = Src[Pitch] */
- movq [edi+80], mm4 /* write lower 0 */
-
- movq mm1, mm0 /* mm1 = mm0 */
- movq [edi+88], mm5 /* write higher 0 */
-
- punpcklbw mm0, mm7 /* lower 1 */
- punpckhbw mm1, mm7 /* higher 1 */
-
- movq mm2, [eax + ecx *2 ] /* mm2 = Src[2*Pitch] */
- lea eax, [eax + ecx *4] /* eax = Src + 4 * Pitch */
-
- movq mm3, mm2 /* mm3 = mm2 */
- movq [edi+96], mm0 /* write lower 1 */
-
- punpcklbw mm2, mm7 /* lower 2 */
- punpckhbw mm3, mm7 /* higher 2 */
-
- movq mm4, [eax + edx ] /* mm4 = Src[3*pitch] */
- movq [edi+104], mm1 /* wirte higher 1 */
-
- movq mm5, mm4 /* mm5 = mm4 */
- punpcklbw mm4, mm7 /* Low 3 */
-
- movq [edi+112], mm2 /* write lower 2 */
- movq [edi+120], mm3 /* write higher 2 */
-
- movq mm0, [eax] /* mm0 = Src[4*pitch] */
- punpckhbw mm5, mm7 /* high 3 */
-
- movq mm1, mm0 /* mm1=mm0 */
- movq [edi+128], mm4 /* low 3 */
-
- punpcklbw mm0, mm7 /* low 4 */
- punpckhbw mm1, mm7 /* high 4 */
-
- movq [edi+136], mm5 /* high 3 */
- movq [edi+144], mm0 /* low 4 */
-
- movq [edi+152], mm1 /* high 4 */
-
- /* done with copying everything to intermediate buffer */
- /* Now, compute the variances for Pixel 1-4 and 5-8 */
-
- /* we use mm0,mm1,mm2 for 1234 and mm4, mm5, mm6 for 5-8 */
- /* mm7 = 0, mm3 = {128, 128, 128, 128} */
-
- pcmpeqw mm3, mm3 /* mm3 = FFFFFFFFFFFFFFFF */
- psllw mm3, 15 /* mm3 = 8000800080008000 */
- psrlw mm3, 8 /* mm3 = 0080008000800080 */
-
- movq mm2, [edi+16] /* Pixel 1 */
- movq mm6, [edi+80] /* Pixel 5 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- movq mm0, mm2 /* mm0 = pixel 1 */
- movq mm4, mm6 /* mm4 = pixel 5 */
-
- pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
- pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
-
- movq mm1, mm2 /* mm1 = pixel1^2 */
- movq mm5, mm6 /* mm5 = pixel5^2 */
-
- movq mm2, [edi+32] /* Pixel 2 */
- movq mm6, [edi+96] /* Pixel 6 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 2 */
- paddw mm4, mm6 /* mm4 += pixel 6 */
-
- pmullw mm2, mm2 /* mm2 = pixel2^2 */
- pmullw mm6, mm6 /* mm6 = pixel6^2 */
-
- paddw mm1, mm2 /* mm1 += pixel2^2 */
- paddw mm5, mm6 /* mm5 += pixel6^2 */
-
- movq mm2, [edi+48] /* Pixel 3 */
- movq mm6, [edi+112] /* Pixel 7 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 3 */
- paddw mm4, mm6 /* mm4 += pixel 7 */
-
- pmullw mm2, mm2 /* mm2 = pixel3^2 */
- pmullw mm6, mm6 /* mm6 = pixel7^2 */
-
- paddw mm1, mm2 /* mm1 += pixel3^2 */
- paddw mm5, mm6 /* mm5 += pixel7^2 */
-
- movq mm2, [edi+64] /* Pixel 4 */
- movq mm6, [edi+128] /* Pixel 8 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 4 */
- paddw mm4, mm6 /* mm4 += pixel 8 */
-
- pmullw mm2, mm2 /* mm2 = pixel4^2 */
- pmullw mm6, mm6 /* mm6 = pixel8^2 */
-
- paddw mm1, mm2 /* mm1 = pixel4^2 */
- paddw mm5, mm6 /* mm5 = pixel8^2 */
-
- /* mm0 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* mm1 = x1 + x2 + x3 + x4 */
- /* mm4 = x5^2 + x6^2 + x7^2 + x8^2 */
- /* mm5 = x5 + x6 + x7 + x8 */
-
- movq mm7, mm3 /* mm7 = mm3 */
- psrlw mm7, 7 /* mm7 = 0001000100010001 */
-
- movq mm2, mm0 /* make copy of sum1 */
- movq mm6, mm4 /* make copy of sum2 */
-
- paddw mm0, mm7 /* (sum1 + 1) */
- paddw mm4, mm7 /* (sum2 + 1) */
-
- psraw mm2, 1 /* sum1 /2 */
- psraw mm6, 1 /* sum2 /2 */
-
- psraw mm0, 1 /* (sum1 + 1)/2 */
- psraw mm4, 1 /* (sum2 + 1)/2 */
-
- pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw mm1, mm2 /* Variance 1 */
- psubw mm5, mm6 /* Variance 2 */
-
- movq mm7, FLimitMmx /* mm7 = FLimit */
- movq mm2, mm1 /* copy of Varinace 1*/
- movq mm6, mm5 /* Variance 2 */
- movq [Variance11], mm1 /* Save Variance1 */
- movq [Variance21], mm5 /* Save Variance2 */
- psubw mm1, mm7 /* Variance 1 < Flimit? */
-
- psubw mm5, mm7 /* Variance 2 < Flimit? */
- psraw mm2, 15 /* Variance 1 > 32768? */
- psraw mm6, 15 /* Vaiance 2 > 32768? */
- psraw mm1, 15 /* FFFF/0000 for true/false */
-
- psraw mm5, 15 /* FFFF/0000 for true/false */
- movq mm7, [edi+64] /* mm0 = Pixel 4 */
- pandn mm2, mm1 /* Variance1<32678 &&
- Variance1<Limit */
- pandn mm6, mm5 /* Variance2<32678 &&
- Variance1<Limit */
-
- movq mm4, [edi+80] /* mm4 = Pixel 5 */
- pand mm6, mm2 /* mm6 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movq mm2, mm7 /* make copy of Pixel4 */
- psubusw mm7, mm4 /* 4 - 5 */
- psubusw mm4, mm2 /* 5 - 4 */
-
- por mm7, mm4 /* abs(4 - 5) */
- psubw mm7, QStepMmx /* abs(4-5)<QStepMmx ? */
-
- psraw mm7, 15 /* FFFF/0000 for True/Flase */
- pand mm7, mm6
-
- /* mm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* now lets look at the right four colomn */
-
- add edi, 8 /* offset 8 to right 4 cols */
-
- movq mm2, [edi+16] /* Pixel 1 */
- movq mm6, [edi+80] /* Pixel 5 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- movq mm0, mm2 /* mm0 = pixel 1 */
- movq mm4, mm6 /* mm4 = pixel 5 */
-
- pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
- pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
-
- movq mm1, mm2 /* mm1 = pixel1^2 */
- movq mm5, mm6 /* mm5 = pixel5^2 */
-
- movq mm2, [edi+32] /* Pixel 2 */
- movq mm6, [edi+96] /* Pixel 6 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 2 */
- paddw mm4, mm6 /* mm4 += pixel 6 */
-
- pmullw mm2, mm2 /* mm2 = pixel2^2 */
- pmullw mm6, mm6 /* mm6 = pixel6^2 */
-
- paddw mm1, mm2 /* mm1 += pixel2^2 */
- paddw mm5, mm6 /* mm5 += pixel6^2 */
-
- movq mm2, [edi+48] /* Pixel 3 */
- movq mm6, [edi+112] /* Pixel 7 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 3 */
- paddw mm4, mm6 /* mm4 += pixel 7 */
-
- pmullw mm2, mm2 /* mm2 = pixel3^2 */
- pmullw mm6, mm6 /* mm6 = pixel7^2 */
-
- paddw mm1, mm2 /* mm1 += pixel3^2 */
- paddw mm5, mm6 /* mm5 += pixel7^2 */
-
- movq mm2, [edi+64] /* Pixel 4 */
- movq mm6, [edi+128] /* Pixel 8 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 4 */
- paddw mm4, mm6 /* mm4 += pixel 8 */
-
- pmullw mm2, mm2 /* mm2 = pixel4^2 */
- pmullw mm6, mm6 /* mm6 = pixel8^2 */
-
- paddw mm1, mm2 /* mm1 = pixel4^2 */
- paddw mm5, mm6 /* mm5 = pixel8^2 */
-
- /* mm0 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* mm1 = x1 + x2 + x3 + x4 */
- /* mm4 = x5^2 + x6^2 + x7^2 + x8^2 */
- /* mm5 = x5 + x6 + x7 + x8 */
-
- psrlw mm3, 7 /* mm3 = 0001000100010001 */
-
- movq mm2, mm0 /* make copy of sum1 */
- movq mm6, mm4 /* make copy of sum2 */
-
- paddw mm0, mm3 /* (sum1 + 1) */
- paddw mm4, mm3 /* (sum2 + 1) */
-
- psraw mm2, 1 /* sum1 /2 */
- psraw mm6, 1 /* sum2 /2 */
-
- psraw mm0, 1 /* (sum1 + 1)/2 */
- psraw mm4, 1 /* (sum2 + 1)/2 */
-
- pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw mm1, mm2 /* Variance 1 */
- psubw mm5, mm6 /* Variance 2 */
- movq [Variance12], mm1 /* Save Variance1 */
- movq [Variance22], mm5 /* Save Variance2 */
-
- movq mm3, FLimitMmx /* mm3 = FLimit */
- movq mm2, mm1 /* copy of Varinace 1*/
- movq mm6, mm5 /* Variance 2 */
- psubw mm1, mm3 /* Variance 1 < Flimit? */
-
- psubw mm5, mm3 /* Variance 2 < Flimit? */
- psraw mm2, 15 /* Variance 1 > 32768? */
- psraw mm6, 15 /* Vaiance 2 > 32768? */
- psraw mm1, 15 /* FFFF/0000 for true/false */
-
- psraw mm5, 15 /* FFFF/0000 for true/false */
- movq mm0, [edi+64] /* mm0 = Pixel 4 */
- pandn mm2, mm1 /* Variance1<32678 &&
- Variance1<Limit */
- pandn mm6, mm5 /* Variance2<32678 &&
- Variance1<Limit */
- movq mm4, [edi+80] /* mm4 = Pixel 5 */
- pand mm6, mm2 /* mm6 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movq mm2, mm0 /* make copy of Pixel4 */
-
- psubusw mm0, mm4 /* 4 - 5 */
- psubusw mm4, mm2 /* 5 - 4 */
-
- por mm0, mm4 /* abs(4 - 5) */
- psubw mm0, QStepMmx /* abs(4-5)<QStepMmx ? */
-
- psraw mm0, 15 /* FFFF/0000 for True/False */
- pand mm0, mm6
-
- sub edi, 8 /* offset edi back */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 and mm7 now are in use */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movq mm5, [edi] /* mm5 = -5 */
- movq mm4, [edi + 16] /* mm4 = -4 */
-
- movq mm3, mm4 /* copy of -4 */
- movq mm6, mm5 /* copy of -5 */
-
- psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
- psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
-
- por mm4, mm5 /* abs([-4]-[-5] ) */
- psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm1, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm1, mm3 /* */
-
- por mm1, mm4 /* mm1 = p1 */
-
- /* now find P2 */
-
- movq mm4, [edi+128] /* mm4 = [3] */
- movq mm5, [edi+144] /* mm5 = [4] */
-
- movq mm3, mm4 /* copy of 3 */
- movq mm6, mm5 /* copy of 4 */
-
- psubusw mm4, mm6 /* mm4 = [3] - [4] */
- psubusw mm5, mm3 /* mm5 = [4] - [3] */
-
- por mm4, mm5 /* abs([3]-[4] ) */
- psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm2, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm2, mm3 /* */
-
- por mm2, mm4 /* mm2 = p2 */
-
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
- /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
- /* Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm3, mm1 /* mm3 = p1 */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- movq mm4, mm3 /* mm4 = mm3 */
-
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
-
- psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
- psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
-
- paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
- psraw mm4, 4 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm7 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
- paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
-
- psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
- paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
-
- psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- paddw mm4, mm4 /* mm4 *=2 */
-
- paddw mm4, mm1 /* += p1 */
- psubw mm4, [edi+16] /* -= x1 */
-
- psubw mm4, [edi+112] /* -= x7 */
- paddw mm4, [edi+128] /* += x8 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x4 */
-
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- paddw mm4, mm4 /* mm4 *= 2 */
-
- paddw mm4, [edi+16] /* += x1 */
- psubw mm4, [edi+32] /* -= x2 */
-
- psubw mm4, [edi+128] /* -= x8 */
- paddw mm4, mm2 /* += p2 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x5 */
-
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+32] /* +=x2 */
- psubw mm4, [edi+48] /* -=x3 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x6 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x6 */
-
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+48] /* +=x3 */
- psubw mm4, [edi+64] /* -=x4 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x7 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x7 */
-
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+64] /* +=x4 */
- psubw mm4, [edi+80] /* -=x5 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x8 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x8 */
-
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with left four columns */
- /* now do the righ four columns */
-
- add edi, 8 /* shift to right four column */
- add esi, 8 /* shift to right four column */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 now are in use */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movq mm5, [edi] /* mm5 = -5 */
- movq mm4, [edi + 16] /* mm4 = -4 */
-
- movq mm3, mm4 /* copy of -4 */
- movq mm6, mm5 /* copy of -5 */
-
- psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
- psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
-
- por mm4, mm5 /* abs([-4]-[-5] ) */
- psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm1, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm1, mm3 /* */
-
- por mm1, mm4 /* mm1 = p1 */
-
- /* now find P2 */
-
- movq mm4, [edi+128] /* mm4 = [3] */
- movq mm5, [edi+144] /* mm5 = [4] */
-
- movq mm3, mm4 /* copy of 3 */
- movq mm6, mm5 /* copy of 4 */
-
- psubusw mm4, mm6 /* mm4 = [3] - [4] */
- psubusw mm5, mm3 /* mm5 = [4] - [3] */
-
- por mm4, mm5 /* abs([3]-[4] ) */
- psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm2, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm2, mm3 /* */
-
- por mm2, mm4 /* mm2 = p2 */
-
- /* psum = p1 + p1 + p1 + v[1] + v[2] + v[3] + v[4] + 4; */
- /* Des[-w4] = (((psum + v[1]) << 1) - (v[4] - v[5])) >> 4; */
- /* Des[-w4]=Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm3, mm1 /* mm3 = p1 */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- movq mm4, mm3 /* mm4 = mm3 */
-
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
-
- psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
- psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
-
- paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
- psraw mm4, 4 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm0 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
- paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
-
- psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
- paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
-
- psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- paddw mm4, mm4 /* mm4 *=2 */
-
- paddw mm4, mm1 /* += p1 */
- psubw mm4, [edi+16] /* -= x1 */
-
- psubw mm4, [edi+112] /* -= x7 */
- paddw mm4, [edi+128] /* += x8 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x4 */
-
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- paddw mm4, mm4 /* mm4 *= 2 */
-
- paddw mm4, [edi+16] /* += x1 */
- psubw mm4, [edi+32] /* -= x2 */
-
- psubw mm4, [edi+128] /* -= x8 */
- paddw mm4, mm2 /* += p2 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x5 */
-
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+32] /* +=x2 */
- psubw mm4, [edi+48] /* -=x3 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x6 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x6 */
-
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+48] /* +=x3 */
- psubw mm4, [edi+64] /* -=x4 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x7 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x7 */
-
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+64] /* +=x4 */
- psubw mm4, [edi+80] /* -=x5 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x8 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x8 */
-
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with right four column */
- add edi, 8 /* shift edi to point x1 */
- sub esi, 8 /* shift esi back to x1 */
- mov ebp, Des /* the destination */
- lea ebp, [ebp + edx *4] /* point to des[-w4] */
-
- movq mm0, [esi]
- packuswb mm0, [esi + 8]
-
- movq [ebp], mm0 /* write des[-w4] */
-
- movq mm1, [esi + 16]
- packuswb mm1, [esi + 24]
-
- movq [ebp+ecx ], mm1 /* write des[-w3] */
-
- movq mm2, [esi + 32]
- packuswb mm2, [esi + 40]
-
- movq [ebp+ecx*2 ], mm2 /* write des[-w2] */
-
- movq mm3, [esi + 48]
- packuswb mm3, [esi + 56]
-
- lea ebp, [ebp+ecx*4] /* point to des[0] */
- movq [ebp+edx], mm3 /* write des[-w1] */
-
- movq mm0, [esi + 64]
- packuswb mm0, [esi + 72]
-
- movq [ebp ], mm0 /* write des[0] */
-
- movq mm1, [esi + 80]
- packuswb mm1, [esi + 88]
-
- movq [ebp+ecx], mm1 /* write des[w1] */
-
- movq mm2, [esi + 96]
- packuswb mm2, [esi + 104]
-
- movq [ebp+ecx*2], mm2 /* write des[w2] */
-
- movq mm3, [esi + 112]
- packuswb mm3, [esi + 120]
-
- lea ebp, [ebp+ecx*2] /* point to des[w4] */
- movq [ebp+ecx], mm3 /* write des[w3] */
-
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebp
- pop eax
-
-
- } /* end of the macro */
-
- Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
- Var1 += Variance12[0]+ Variance12[1]+Variance12[2]+Variance12[3];
- pbi->FragmentVariances[CurrentFrag] += Var1;
- Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
- Var2 += Variance22[0]+ Variance22[1]+Variance22[2]+Variance22[3];
- pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
- }
- else
- {
- /* copy from src to des */
- __asm
- {
- push esi
- push edi
- push ecx
-
- mov esi, Src /* esi = Src */
- mov edi, Des /* edi = Des */
- push edx
- mov ecx, PlaneLineStep /* ecx = Pitch */
- xor edx, edx /* clear edx */
-
- sub edx, ecx /* edx = -Pitch */
- lea esi, [esi+edx*4] /* esi=Src-4*Pitch*/
-
- movq mm0, [esi] /* first row */
- movq [edi+edx*4], mm0 /* write first row */
-
- lea edi, [edi+edx*4] /* edi=Des-4*Pitch*/
- movq mm1, [esi+ecx] /* Src-3*Pitch */
- movq [edi+ecx], mm1 /* write second row */
- movq mm2, [esi+ecx*2] /* Src-2*Pitch */
- lea esi, [esi+ecx*4] /* Src */
- movq [edi+ecx*2], mm2 /* write third row */
- lea edi, [edi+ecx*4] /* Des */
- movq mm3, [esi+edx] /* Src-Pitch */
-
- movq [edi+edx], mm3 /* write fourth row */
- movq mm4, [esi] /* Src */
- movq mm5, [esi+ecx] /* Src+Pitch */
- movq [edi], mm4 /* write fifth rwo */
- movq mm6, [esi+ecx*2]
- lea esi, [esi+ecx*4] /* Src+pitch*4 */
- movq [edi+ecx], mm5 /* write the sixth rwo */
- movq [edi+ecx*2], mm6 /* write the seventh row */
- movq mm7, [esi+edx]
- lea edi, [edi+ecx*4] /* Des+Pitch*4 */
- movq [edi+edx], mm7 /* write the last row */
- pop edx
- pop ecx
- pop edi
- pop esi
- }
- }
-
- Src += 8;
- Des += 8;
- CurrentFrag ++;
- }
- Des -= ((PlaneLineStep + FragAcross)<<3);
- Des += 8;
- Src = Des;
- CurrentFrag = StartFrag ;
- while(CurrentFrag < StartFrag + FragAcross - 1)
- {
- QStep = QuantScale[pbi->FragQIndex[CurrentFrag+1]];
- if( QStep > 3 )
- {
- QStepMmx[0] = (INT16)QStep;
- QStepMmx[1] = (INT16)QStep;
- QStepMmx[2] = (INT16)QStep;
- QStepMmx[3] = (INT16)QStep;
- for( j=0; j<8;j++)
- {
- Rows[j] = (short) (Src[-5 +j*PlaneLineStep]);
- Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
- }
- __asm
- {
- /* Save the registers */
- push eax
- push ebp
- push ecx
- push edx
- push esi
- push edi
-
- /* Calculate the FLimit and store FLimit and QStep */
-
- movq mm0, QStepMmx /* mm0 = QStep */
- movq mm1, FourThrees /* mm1 = 03030303 */
- pmullw mm1, mm0 /* mm1 = QStep * 3 */
- pmullw mm1, mm0 /* mm1 = QStep * QStep * 3 */
-
- psrlw mm1, 5 /* mm1 = FLimit */
- movq [FLimitMmx], mm1 /* Save FLimit */
- /* setup the pointers to data */
- mov eax, Src /* eax = Src */
- xor edx, edx /* clear edx */
-
- sub eax, 4 /* eax = Src-4 */
- lea esi, NewRows /* esi = NewRows */
- lea edi, Rows /* edi = Rows */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- sub edx, ecx /* edx = -Pitch */
- /* Get the data to the intermediate buffer */
- movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
- movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
- movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
- lea eax, [eax+ecx*4] /* Go down four Rows */
- movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
- movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
-
- punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
- punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
- movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
- punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
- punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
- movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
- punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
- punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
-
- movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
- punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
- punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
- pxor mm7, mm7 /* clear mm7 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
- movq [edi+16], mm0 /* write 00 10 20 30 */
- punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
- movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
- movq [edi+32], mm5 /* write 01 11 21 31 */
-
- punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
- punpckhbw mm0, mm7 /* mm0 = 33 23 12 03 */
- movq [edi+48], mm1 /* write 02 12 22 32 */
- movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
-
- movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
- movq [edi+64], mm0 /* write 03 13 23 33 */
- punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
- punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
- movq [edi+80], mm2 /* write 04 14 24 34 */
- punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
- punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
- movq [edi+96], mm3 /* write 05 15 25 35 */
-
- movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
- movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
- movq [edi+112], mm4 /* write 06 16 26 37 */
- movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
- lea eax, [eax+ ecx*4] /* Go down four rows */
- movq [edi+128], mm5 /* write 07 17 27 37 */
- movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
- movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
- punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
- punpckhbw mm4, mm1 /* mm4 = 57 57 56 46 55 45 54 44 */
- movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
- punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
- punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
- movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
- punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
- punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
-
- movq mm2, mm4 /* mm2 = 57 57 56 46 55 45 54 44 */
- punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
- punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
- movq [edi+24], mm0 /* write 40 50 60 70 */
- punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
- movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
- movq [edi+40], mm5 /* write 41 51 61 71 */
-
- punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
- punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
- movq [edi+56], mm1 /* write 42 52 62 72 */
- movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
-
- movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
- movq [edi+72], mm0 /* write 43 53 63 73 */
- punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
- punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
- movq [edi+88], mm2 /* write 44 54 64 74 */
- punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
- punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
- movq [edi+104], mm3 /* write 45 55 65 75 */
-
- movq [edi+120], mm4 /* write 46 56 66 76 */
- movq [edi+136], mm5 /* write 47 57 67 77 */
- /* Now, compute the variances for Pixel 1-4 and 5-8 */
-
- /* we use mm0,mm1,mm2 for 1234 and mm4, mm5, mm6 for 5-8 */
- /* mm7 = 0, mm3 = {128, 128, 128, 128} */
-
- pcmpeqw mm3, mm3 /* mm3 = FFFFFFFFFFFFFFFF */
- psllw mm3, 15 /* mm3 = 8000800080008000 */
- psrlw mm3, 8 /* mm3 = 0080008000800080 */
-
- movq mm2, [edi+16] /* Pixel 1 */
- movq mm6, [edi+80] /* Pixel 5 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- movq mm0, mm2 /* mm0 = pixel 1 */
- movq mm4, mm6 /* mm4 = pixel 5 */
-
- pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
- pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
-
- movq mm1, mm2 /* mm1 = pixel1^2 */
- movq mm5, mm6 /* mm5 = pixel5^2 */
-
- movq mm2, [edi+32] /* Pixel 2 */
- movq mm6, [edi+96] /* Pixel 6 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 2 */
- paddw mm4, mm6 /* mm4 += pixel 6 */
-
- pmullw mm2, mm2 /* mm2 = pixel2^2 */
- pmullw mm6, mm6 /* mm6 = pixel6^2 */
-
- paddw mm1, mm2 /* mm1 += pixel2^2 */
- paddw mm5, mm6 /* mm5 += pixel6^2 */
-
- movq mm2, [edi+48] /* Pixel 3 */
- movq mm6, [edi+112] /* Pixel 7 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 3 */
- paddw mm4, mm6 /* mm4 += pixel 7 */
-
- pmullw mm2, mm2 /* mm2 = pixel3^2 */
- pmullw mm6, mm6 /* mm6 = pixel7^2 */
-
- paddw mm1, mm2 /* mm1 += pixel3^2 */
- paddw mm5, mm6 /* mm5 += pixel7^2 */
-
- movq mm2, [edi+64] /* Pixel 4 */
- movq mm6, [edi+128] /* Pixel 8 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 4 */
- paddw mm4, mm6 /* mm4 += pixel 8 */
-
- pmullw mm2, mm2 /* mm2 = pixel4^2 */
- pmullw mm6, mm6 /* mm6 = pixel8^2 */
-
- paddw mm1, mm2 /* mm1 = pixel4^2 */
- paddw mm5, mm6 /* mm5 = pixel8^2 */
-
- /* mm0 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* mm1 = x1 + x2 + x3 + x4 */
- /* mm4 = x5^2 + x6^2 + x7^2 + x8^2 */
- /* mm5 = x5 + x6 + x7 + x8 */
-
- movq mm7, mm3 /* mm7 = mm3 */
- psrlw mm7, 7 /* mm7 = 0001000100010001 */
-
- movq mm2, mm0 /* make copy of sum1 */
- movq mm6, mm4 /* make copy of sum2 */
-
- paddw mm0, mm7 /* (sum1 + 1) */
- paddw mm4, mm7 /* (sum2 + 1) */
-
- psraw mm2, 1 /* sum1 /2 */
- psraw mm6, 1 /* sum2 /2 */
-
- psraw mm0, 1 /* (sum1 + 1)/2 */
- psraw mm4, 1 /* (sum2 + 1)/2 */
-
- pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw mm1, mm2 /* Variance 1 */
- psubw mm5, mm6 /* Variance 2 */
-
- movq [Variance11], mm1 /* Save Variance1 */
- movq [Variance21], mm5 /* Save Variance2 */
- movq mm7, FLimitMmx /* mm7 = FLimit */
- movq mm2, mm1 /* copy of Variance 1*/
- movq mm6, mm5 /* copy of Variance 2*/
- psubw mm1, mm7 /* Variance 1 < Flimit? */
-
- psubw mm5, mm7 /* Variance 2 < Flimit? */
- psraw mm1, 15 /* FFFF/0000 for true/false */
-
- psraw mm5, 15 /* FFFF/0000 for true/false */
- psraw mm2, 15 /* Variance 1 > 32768 ? */
- psraw mm6, 15 /* Variance 2 > 32768 ? */
- movq mm7, [edi+64] /* mm0 = Pixel 4 */
- pandn mm2, mm1 /* Variance 1 < Flimit &&
- Variance 1 < 32768 */
- pandn mm6, mm5 /* Variance 2 < Flimit &&
- Variance 2 < 32768 */
- movq mm4, [edi+80] /* mm4 = Pixel 5 */
- pand mm6, mm2 /* mm1 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movq mm2, mm7 /* make copy of Pixel4 */
- psubusw mm7, mm4 /* 4 - 5 */
- psubusw mm4, mm2 /* 5 - 4 */
-
- por mm7, mm4 /* abs(4 - 5) */
- psubw mm7, QStepMmx /* abs(4-5)<QStepMmx ? */
-
- psraw mm7, 15 /* FFFF/0000 for True/Flase */
- pand mm7, mm6
-
- /* mm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* now lets look at the right four colomn */
-
- add edi, 8 /* offset 8 to right 4 cols */
-
- movq mm2, [edi+16] /* Pixel 1 */
- movq mm6, [edi+80] /* Pixel 5 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- movq mm0, mm2 /* mm0 = pixel 1 */
- movq mm4, mm6 /* mm4 = pixel 5 */
-
- pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
- pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
-
- movq mm1, mm2 /* mm1 = pixel1^2 */
- movq mm5, mm6 /* mm5 = pixel5^2 */
-
- movq mm2, [edi+32] /* Pixel 2 */
- movq mm6, [edi+96] /* Pixel 6 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 2 */
- paddw mm4, mm6 /* mm4 += pixel 6 */
-
- pmullw mm2, mm2 /* mm2 = pixel2^2 */
- pmullw mm6, mm6 /* mm6 = pixel6^2 */
-
- paddw mm1, mm2 /* mm1 += pixel2^2 */
- paddw mm5, mm6 /* mm5 += pixel6^2 */
-
- movq mm2, [edi+48] /* Pixel 3 */
- movq mm6, [edi+112] /* Pixel 7 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 3 */
- paddw mm4, mm6 /* mm4 += pixel 7 */
-
- pmullw mm2, mm2 /* mm2 = pixel3^2 */
- pmullw mm6, mm6 /* mm6 = pixel7^2 */
-
- paddw mm1, mm2 /* mm1 += pixel3^2 */
- paddw mm5, mm6 /* mm5 += pixel7^2 */
-
- movq mm2, [edi+64] /* Pixel 4 */
- movq mm6, [edi+128] /* Pixel 8 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 4 */
- paddw mm4, mm6 /* mm4 += pixel 8 */
-
- pmullw mm2, mm2 /* mm2 = pixel4^2 */
- pmullw mm6, mm6 /* mm6 = pixel8^2 */
-
- paddw mm1, mm2 /* mm1 = pixel4^2 */
- paddw mm5, mm6 /* mm5 = pixel8^2 */
-
- /* mm0 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* mm1 = x1 + x2 + x3 + x4 */
- /* mm4 = x5^2 + x6^2 + x7^2 + x8^2 */
- /* mm5 = x5 + x6 + x7 + x8 */
-
- psrlw mm3, 7 /* mm3 = 0001000100010001 */
-
- movq mm2, mm0 /* make copy of sum1 */
- movq mm6, mm4 /* make copy of sum2 */
-
- paddw mm0, mm3 /* (sum1 + 1) */
- paddw mm4, mm3 /* (sum2 + 1) */
-
- psraw mm2, 1 /* sum1 /2 */
- psraw mm6, 1 /* sum2 /2 */
-
- psraw mm0, 1 /* (sum1 + 1)/2 */
- psraw mm4, 1 /* (sum2 + 1)/2 */
-
- pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw mm1, mm2 /* Variance 1 */
- psubw mm5, mm6 /* Variance 2 */
-
- movq [Variance12], mm1 /* Save Variance1 */
- movq [Variance22], mm5 /* Save Variance2 */
-
- movq mm3, FLimitMmx /* mm3 = FLimit */
- movq mm2, mm1 /* copy of Varinace 1*/
- movq mm6, mm5 /* Variance 2 */
- psubw mm1, mm3 /* Variance 1 < Flimit? */
-
- psubw mm5, mm3 /* Variance 2 < Flimit? */
- psraw mm6, 15 /* Variance 1 > 32768 */
-
- psraw mm2, 15 /* Variance 2 > 32768 */
- psraw mm1, 15 /* FFFF/0000 for true/false */
-
- psraw mm5, 15 /* FFFF/0000 for true/false */
- movq mm0, [edi+64] /* mm0 = Pixel 4 */
- pandn mm2, mm1 /* Variance1<32678 &&
- Variance1<Limit */
- pandn mm6, mm5 /* Variance2<32678 &&
- Variance1<Limit */
-
- movq mm4, [edi+80] /* mm4 = Pixel 5 */
- pand mm6, mm2 /* mm1 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movq mm2, mm0 /* make copy of Pixel4 */
-
- psubusw mm0, mm4 /* 4 - 5 */
- psubusw mm4, mm2 /* 5 - 4 */
-
- por mm0, mm4 /* abs(4 - 5) */
- psubw mm0, QStepMmx /* abs(4-5)<QStepMmx ? */
-
- psraw mm0, 15 /* FFFF/0000 for True/False */
- pand mm0, mm6
-
- sub edi, 8 /* offset edi back */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 and mm7 now are in use */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movq mm5, [edi] /* mm5 = -5 */
- movq mm4, [edi + 16] /* mm4 = -4 */
-
- movq mm3, mm4 /* copy of -4 */
- movq mm6, mm5 /* copy of -5 */
-
- psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
- psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
-
- por mm4, mm5 /* abs([-4]-[-5] ) */
- psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm1, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm1, mm3 /* */
-
- por mm1, mm4 /* mm1 = p1 */
-
- /* now find P2 */
-
- movq mm4, [edi+128] /* mm4 = [3] */
- movq mm5, [edi+144] /* mm5 = [4] */
-
- movq mm3, mm4 /* copy of 3 */
- movq mm6, mm5 /* copy of 4 */
-
- psubusw mm4, mm6 /* mm4 = [3] - [4] */
- psubusw mm5, mm3 /* mm5 = [4] - [3] */
-
- por mm4, mm5 /* abs([3]-[4] ) */
- psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm2, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm2, mm3 /* */
-
- por mm2, mm4 /* mm2 = p2 */
-
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
- /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
- /* Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm3, mm1 /* mm3 = p1 */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- movq mm4, mm3 /* mm4 = mm3 */
-
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
-
- psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
- psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
-
- paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
- psraw mm4, 4 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm7 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
- paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
-
- psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
- paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
-
- psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- paddw mm4, mm4 /* mm4 *=2 */
-
- paddw mm4, mm1 /* += p1 */
- psubw mm4, [edi+16] /* -= x1 */
-
- psubw mm4, [edi+112] /* -= x7 */
- paddw mm4, [edi+128] /* += x8 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x4 */
-
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- paddw mm4, mm4 /* mm4 *= 2 */
-
- paddw mm4, [edi+16] /* += x1 */
- psubw mm4, [edi+32] /* -= x2 */
-
- psubw mm4, [edi+128] /* -= x8 */
- paddw mm4, mm2 /* += p2 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x5 */
-
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+32] /* +=x2 */
- psubw mm4, [edi+48] /* -=x3 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x6 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x6 */
-
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+48] /* +=x3 */
- psubw mm4, [edi+64] /* -=x4 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x7 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x7 */
-
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+64] /* +=x4 */
- psubw mm4, [edi+80] /* -=x5 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x8 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x8 */
-
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with left four columns */
- /* now do the righ four columns */
-
- add edi, 8 /* shift to right four column */
- add esi, 8 /* shift to right four column */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 now are in use */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movq mm5, [edi] /* mm5 = -5 */
- movq mm4, [edi + 16] /* mm4 = -4 */
-
- movq mm3, mm4 /* copy of -4 */
- movq mm6, mm5 /* copy of -5 */
-
- psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
- psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
-
- por mm4, mm5 /* abs([-4]-[-5] ) */
- psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm1, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm1, mm3 /* */
-
- por mm1, mm4 /* mm1 = p1 */
-
- /* now find P2 */
-
- movq mm4, [edi+128] /* mm4 = [3] */
- movq mm5, [edi+144] /* mm5 = [4] */
-
- movq mm3, mm4 /* copy of 3 */
- movq mm6, mm5 /* copy of 4 */
-
- psubusw mm4, mm6 /* mm4 = [3] - [4] */
- psubusw mm5, mm3 /* mm5 = [4] - [3] */
-
- por mm4, mm5 /* abs([3]-[4] ) */
- psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm2, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm2, mm3 /* */
-
- por mm2, mm4 /* mm2 = p2 */
-
- /* psum = p1 + p1 + p1 + v[1] + v[2] + v[3] + v[4] + 4; */
- /* Des[-w4] = (((psum + v[1]) << 1) - (v[4] - v[5])) >> 4; */
- /* Des[-w4]=Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm3, mm1 /* mm3 = p1 */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- movq mm4, mm3 /* mm4 = mm3 */
-
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
-
- psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
- psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
-
- paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
- psraw mm4, 4 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm0 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
- paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
-
- psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
- paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
-
- psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- paddw mm4, mm4 /* mm4 *=2 */
-
- paddw mm4, mm1 /* += p1 */
- psubw mm4, [edi+16] /* -= x1 */
-
- psubw mm4, [edi+112] /* -= x7 */
- paddw mm4, [edi+128] /* += x8 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x4 */
-
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- paddw mm4, mm4 /* mm4 *= 2 */
-
- paddw mm4, [edi+16] /* += x1 */
- psubw mm4, [edi+32] /* -= x2 */
-
- psubw mm4, [edi+128] /* -= x8 */
- paddw mm4, mm2 /* += p2 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x5 */
-
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+32] /* +=x2 */
- psubw mm4, [edi+48] /* -=x3 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x6 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x6 */
-
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+48] /* +=x3 */
- psubw mm4, [edi+64] /* -=x4 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x7 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x7 */
-
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+64] /* +=x4 */
- psubw mm4, [edi+80] /* -=x5 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x8 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x8 */
-
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with right four column */
- /* transpose */
- mov eax, Des /* the destination */
- add edi, 8 /* shift edi to point x1 */
- sub esi, 8 /* shift esi back to left x1 */
- sub eax, 4
- movq mm0, [esi] /* mm0 = 30 20 10 00 */
- movq mm1, [esi+16] /* mm1 = 31 21 11 01 */
- movq mm4, mm0 /* mm4 = 30 20 10 00 */
- punpcklwd mm0, mm1 /* mm0 = 11 10 01 00 */
- punpckhwd mm4, mm1 /* mm4 = 31 30 21 20 */
- movq mm2, [esi+32] /* mm2 = 32 22 12 02 */
- movq mm3, [esi+48] /* mm3 = 33 23 13 03 */
- movq mm5, mm2 /* mm5 = 32 22 12 02 */
- punpcklwd mm2, mm3 /* mm2 = 13 12 03 02 */
- punpckhwd mm5, mm3 /* mm5 = 33 32 23 22 */
- movq mm1, mm0 /* mm1 = 11 10 01 00 */
- punpckldq mm0, mm2 /* mm0 = 03 02 01 00 */
- movq [edi], mm0 /* write 00 01 02 03 */
- punpckhdq mm1, mm2 /* mm1 = 13 12 11 10 */
-
- movq mm0, mm4 /* mm0 = 31 30 21 20 */
- movq [edi+16], mm1 /* write 10 11 12 13 */
- punpckldq mm0, mm5 /* mm0 = 23 22 21 20 */
- punpckhdq mm4, mm5 /* mm4 = 33 32 31 30 */
- movq mm1, [esi+64] /* mm1 = 34 24 14 04 */
- movq mm2, [esi+80] /* mm2 = 35 25 15 05 */
- movq mm5, [esi+96] /* mm5 = 36 26 16 06 */
- movq mm6, [esi+112] /* mm6 = 37 27 17 07 */
-
- movq mm3, mm1 /* mm3 = 34 24 14 04 */
- movq mm7, mm5 /* mm7 = 36 26 16 06 */
- punpcklwd mm1, mm2 /* mm1 = 15 14 05 04 */
- punpckhwd mm3, mm2 /* mm3 = 35 34 25 24 */
- punpcklwd mm5, mm6 /* mm5 = 17 16 07 06 */
- punpckhwd mm7, mm6 /* mm7 = 37 36 27 26 */
- movq mm2, mm1 /* mm2 = 15 14 05 04 */
- movq mm6, mm3 /* mm6 = 35 34 25 24 */
- punpckldq mm1, mm5 /* mm1 = 07 06 05 04 */
- punpckhdq mm2, mm5 /* mm2 = 17 16 15 14 */
- punpckldq mm3, mm7 /* mm3 = 27 26 25 24 */
- punpckhdq mm6, mm7 /* mm6 = 37 36 35 34 */
-
- movq mm5, [edi] /* mm5 = 03 02 01 00 */
- packuswb mm5, mm1 /* mm5 = 07 06 05 04 03 02 01 00 */
-
- movq [eax], mm5 /* write 00 01 02 03 04 05 06 07 */
- movq mm7, [edi+16] /* mm7 = 13 12 11 10 */
- packuswb mm7, mm2 /* mm7 = 17 16 15 14 13 12 11 10 */
- movq [eax+ecx], mm7 /* write 10 11 12 13 14 15 16 17 */
- packuswb mm0, mm3 /* mm0 = 27 26 25 24 23 22 21 20 */
- packuswb mm4, mm6 /* mm4 = 37 36 35 34 33 32 31 30 */
-
- movq [eax+ecx*2], mm0 /* write 20 21 22 23 24 25 26 27 */
- lea eax, [eax+ecx*4] /* mov forward the desPtr */
- movq [eax+edx], mm4 /* write 30 31 32 33 34 35 36 37 */
- add edi, 8 /* move to right four column */
- add esi, 8 /* move to right x1 */
- movq mm0, [esi] /* mm0 = 70 60 50 40 */
- movq mm1, [esi+16] /* mm1 = 71 61 51 41 */
- movq mm4, mm0 /* mm4 = 70 60 50 40 */
- punpcklwd mm0, mm1 /* mm0 = 51 50 41 40 */
- punpckhwd mm4, mm1 /* mm4 = 71 70 61 60 */
- movq mm2, [esi+32] /* mm2 = 72 62 52 42 */
- movq mm3, [esi+48] /* mm3 = 73 63 53 43 */
- movq mm5, mm2 /* mm5 = 72 62 52 42 */
- punpcklwd mm2, mm3 /* mm2 = 53 52 43 42 */
- punpckhwd mm5, mm3 /* mm5 = 73 72 63 62 */
- movq mm1, mm0 /* mm1 = 51 50 41 40 */
- punpckldq mm0, mm2 /* mm0 = 43 42 41 40 */
- movq [edi], mm0 /* write 40 41 42 43 */
- punpckhdq mm1, mm2 /* mm1 = 53 52 51 50 */
-
- movq mm0, mm4 /* mm0 = 71 70 61 60 */
- movq [edi+16], mm1 /* write 50 51 52 53 */
- punpckldq mm0, mm5 /* mm0 = 63 62 61 60 */
- punpckhdq mm4, mm5 /* mm4 = 73 72 71 70 */
- movq mm1, [esi+64] /* mm1 = 74 64 54 44 */
- movq mm2, [esi+80] /* mm2 = 75 65 55 45 */
- movq mm5, [esi+96] /* mm5 = 76 66 56 46 */
- movq mm6, [esi+112] /* mm6 = 77 67 57 47 */
-
- movq mm3, mm1 /* mm3 = 74 64 54 44 */
- movq mm7, mm5 /* mm7 = 76 66 56 46 */
- punpcklwd mm1, mm2 /* mm1 = 55 54 45 44 */
- punpckhwd mm3, mm2 /* mm3 = 75 74 65 64 */
- punpcklwd mm5, mm6 /* mm5 = 57 56 47 46 */
- punpckhwd mm7, mm6 /* mm7 = 77 76 67 66 */
- movq mm2, mm1 /* mm2 = 55 54 45 44 */
- movq mm6, mm3 /* mm6 = 75 74 65 64 */
- punpckldq mm1, mm5 /* mm1 = 47 46 45 44 */
- punpckhdq mm2, mm5 /* mm2 = 57 56 55 54 */
- punpckldq mm3, mm7 /* mm3 = 67 66 65 64 */
- punpckhdq mm6, mm7 /* mm6 = 77 76 75 74 */
-
- movq mm5, [edi] /* mm5 = 43 42 41 40 */
- packuswb mm5, mm1 /* mm5 = 47 46 45 44 43 42 41 40 */
-
- movq [eax], mm5 /* write 40 41 42 43 44 45 46 47 */
- movq mm7, [edi+16] /* mm7 = 53 52 51 50 */
- packuswb mm7, mm2 /* mm7 = 57 56 55 54 53 52 51 50 */
- movq [eax+ecx], mm7 /* write 50 51 52 53 54 55 56 57 */
- packuswb mm0, mm3 /* mm0 = 67 66 65 64 63 62 61 60 */
- packuswb mm4, mm6 /* mm4 = 77 76 75 74 73 72 71 70 */
-
- movq [eax+ecx*2], mm0 /* write 60 61 62 63 64 65 66 67 */
- lea eax, [eax+ecx*4] /* mov forward the desPtr */
- movq [eax+edx], mm4 /* write 70 71 72 73 74 75 76 77 */
-
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebp
- pop eax
- }
- Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
- Var1 += Variance12[0]+ Variance12[1]+Variance12[2]+Variance12[3];
- pbi->FragmentVariances[CurrentFrag] += Var1;
- Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
- Var2 += Variance22[0]+ Variance22[1]+Variance22[2]+Variance22[3];
- pbi->FragmentVariances[CurrentFrag + 1] += Var2;
- }
- CurrentFrag ++;
- Src += 8;
- Des += 8;
- }
- }
- /****************************************************************************
- *
- * ROUTINE : DeblockNonFilteredBand_MMX
- *
- * INPUTS : None
- *
- * OUTPUTS : None
- *
- * RETURNS : None
- *
- * FUNCTION : Filter both horizontal and vertical edge in a band
- *
- * SPECIAL NOTES :
- *
- * REFERENCE :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void DeblockNonFilteredBand_MMX(
- POSTPROC_INSTANCE *pbi,
- UINT8 *SrcPtr,
- UINT8 *DesPtr,
- UINT32 PlaneLineStep,
- UINT32 FragAcross,
- UINT32 StartFrag,
- UINT32 *QuantScale
- )
- {
- UINT32 j;
- UINT32 CurrentFrag=StartFrag;
- UINT32 QStep;
- UINT32 LoopFLimit;
- UINT8 *Src, *Des;
- UINT32 Var1, Var2;
- #if defined(_WIN32_WCE)
- #pragma pack(16)
- short QStepMmx[4];
- short FLimitMmx[4];
- short LoopFLimitMmx[4];
- short Rows[80];
- short NewRows[64];
- short LoopFilteredValuesUp[4];
- short LoopFilteredValuesDown[4];
- unsigned short Variance11[4];
- unsigned short Variance12[4];
- unsigned short Variance21[4];
- unsigned short Variance22[4];
- #pragma pack()
- #else
- __declspec(align(16)) short QStepMmx[4];
- __declspec(align(16)) short FLimitMmx[4];
- __declspec(align(16)) short LoopFLimitMmx[4];
- __declspec(align(16)) short Rows[80];
- __declspec(align(16)) short NewRows[64];
- __declspec(align(16)) short LoopFilteredValuesUp[4];
- __declspec(align(16)) short LoopFilteredValuesDown[4];
- __declspec(align(16)) unsigned short Variance11[4];
- __declspec(align(16)) unsigned short Variance12[4];
- __declspec(align(16)) unsigned short Variance21[4];
- __declspec(align(16)) unsigned short Variance22[4];
- #endif
- LoopFLimit = DeblockLimitValuesV2[pbi->FrameQIndex];
- LoopFLimitMmx[0] = (INT16)LoopFLimit;
- LoopFLimitMmx[1] = (INT16)LoopFLimit;
- LoopFLimitMmx[2] = (INT16)LoopFLimit;
- LoopFLimitMmx[3] = (INT16)LoopFLimit;
- while(CurrentFrag < StartFrag + FragAcross )
- {
- Src=SrcPtr+8*(CurrentFrag-StartFrag);
- Des=DesPtr+8*(CurrentFrag-StartFrag);
- QStep = QuantScale[ pbi->FragQIndex[CurrentFrag+FragAcross]];
- __asm
- {
-
- push eax
- push ebp
-
- push ecx
- push edx
- push esi
- push edi
- /* Calculate the FLimit and store FLimit and QStep */
- /* Copy the data to the intermediate buffer */
- mov eax, QStep
- xor edx, edx /* clear edx */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- pcmpeqw mm6, mm6
-
-
- movd mm5, eax
- mov eax, Src /* eax = Src */
-
- psrlw mm6, 14 /* mm6 = 3, 3, 3, 3*/
- punpcklwd mm5, mm5
- lea esi, NewRows /* esi = NewRows */
- punpckldq mm5, mm5
- sub edx, ecx /* edx = - Pitch */
- pmullw mm6, mm5 /* Qstep * 3 */
- movq QStepMmx, mm5
- lea edi, Rows /* edi = Rows */
- pxor mm7, mm7 /* Clear mm7 */
- pmullw mm6, mm5
-
- lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
- movq mm0, [eax + edx] /* mm0 = Src[-5*Pitch] */
-
- movq mm1, mm0 /* mm1 = mm0 */
- punpcklbw mm0, mm7 /* Lower Four -5 */
- psrlw mm6, 5
- movq [FLimitMmx], mm6
-
- movq mm2, [eax] /* mm2 = Src[-4*Pitch] */
- punpckhbw mm1, mm7 /* Higher Four -5 */
- movq mm3, mm2 /* mm3 = mm2 */
- punpcklbw mm2, mm7 /* Lower Four -4 */
- movq [edi], mm0 /* Write Lower Four of -5 */
- punpckhbw mm3, mm7 /* higher Four -4 */
-
- movq [edi+8], mm1 /* Write Higher Four of -5 */
- movq mm4, [eax + ecx] /* mm4 = Src[-3*Pitch] */
-
- movq [edi+16], mm2 /* Write Lower -4 */
- movq [edi+24], mm3 /* write hight -4 */
-
- movq mm5, mm4 /* mm5 = mm4 */
- punpcklbw mm4, mm7 /* lower four -3 */
-
- movq mm0, [eax + ecx *2] /* mm0 = Src[-2*Pitch] */
- punpckhbw mm5, mm7 /* higher four -3 */
-
- movq mm1, mm0 /* mm1 = mm0 */
- movq [edi+32], mm4 /* write Lower -3 */
-
- punpcklbw mm0, mm7 /* lower four -2 */
- lea eax, [eax + ecx *4] /* eax = Src */
-
- movq [edi+40], mm5 /* write Higher -3 */
- punpckhbw mm1, mm7 /* higher four -2 */
-
- movq mm2, [eax + edx] /* mm2 = Src[-Pitch] */
- movq [edi+48], mm0 /* lower -2 */
-
- movq mm3, mm2 /* mm3 = mm2 */
- punpcklbw mm2, mm7 /* lower -1 */
-
- movq [edi+56], mm1 /* higher -2 */
- punpckhbw mm3, mm7 /* Higher -1 */
-
- movq mm4, [eax] /* mm4 = Src[0] */
- movq [edi+64], mm2 /* Lower -1 */
-
- movq mm5, mm4 /* mm5 = mm4 */
- movq [edi+72], mm3 /* Higher -1 */
-
- punpcklbw mm4, mm7 /* lower 0 */
- punpckhbw mm5, mm7 /* higher 0 */
-
- movq mm0, [eax + ecx] /* mm0 = Src[Pitch] */
- movq [edi+80], mm4 /* write lower 0 */
-
- movq mm1, mm0 /* mm1 = mm0 */
- movq [edi+88], mm5 /* write higher 0 */
-
- punpcklbw mm0, mm7 /* lower 1 */
- punpckhbw mm1, mm7 /* higher 1 */
-
- movq mm2, [eax + ecx *2 ] /* mm2 = Src[2*Pitch] */
- lea eax, [eax + ecx *4] /* eax = Src + 4 * Pitch */
-
- movq mm3, mm2 /* mm3 = mm2 */
- movq [edi+96], mm0 /* write lower 1 */
-
- punpcklbw mm2, mm7 /* lower 2 */
- punpckhbw mm3, mm7 /* higher 2 */
-
- movq mm4, [eax + edx ] /* mm4 = Src[3*pitch] */
- movq [edi+104], mm1 /* wirte higher 1 */
-
- movq mm5, mm4 /* mm5 = mm4 */
- punpcklbw mm4, mm7 /* Low 3 */
-
- movq [edi+112], mm2 /* write lower 2 */
- movq [edi+120], mm3 /* write higher 2 */
-
- movq mm0, [eax] /* mm0 = Src[4*pitch] */
- punpckhbw mm5, mm7 /* high 3 */
-
- movq mm1, mm0 /* mm1=mm0 */
- movq [edi+128], mm4 /* low 3 */
-
- punpcklbw mm0, mm7 /* low 4 */
- punpckhbw mm1, mm7 /* high 4 */
-
- movq [edi+136], mm5 /* high 3 */
- movq [edi+144], mm0 /* low 4 */
-
- movq [edi+152], mm1 /* high 4 */
-
- /*
- mov eax, Des
- lea eax, [eax+edx*4]
- movq mm2, [eax]
- movq mm2, [eax+ecx]
- movq mm2, [eax+ecx*2]
- lea eax, [eax+ecx*4]
- movq mm2, [eax+edx]
- movq mm2, [eax]
- movq mm2, [eax+ecx]
- movq mm2, [eax+ecx*2]
- lea eax, [eax+ecx*4]
- movq mm2, [eax+edx]
- movq mm2, [eax]
-
- */
-
- /* done with copying everything to intermediate buffer */
- /* Now, compute the variances for Pixel 1-4 and 5-8 */
-
- /* we use mm0,mm1,mm2 for 1234 and mm4, mm5, mm6 for 5-8 */
- /* mm7 = 0, mm3 = {128, 128, 128, 128} */
-
- pcmpeqw mm3, mm3 /* mm3 = FFFFFFFFFFFFFFFF */
- psllw mm3, 15 /* mm3 = 8000800080008000 */
- psrlw mm3, 8 /* mm3 = 0080008000800080 */
-
- movq mm2, [edi+16] /* Pixel 1 */
- movq mm6, [edi+80] /* Pixel 5 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- movq mm0, mm2 /* mm0 = pixel 1 */
- movq mm4, mm6 /* mm4 = pixel 5 */
-
- pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
- pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
-
- movq mm1, mm2 /* mm1 = pixel1^2 */
- movq mm5, mm6 /* mm5 = pixel5^2 */
-
- movq mm2, [edi+32] /* Pixel 2 */
- movq mm6, [edi+96] /* Pixel 6 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 2 */
- paddw mm4, mm6 /* mm4 += pixel 6 */
-
- pmullw mm2, mm2 /* mm2 = pixel2^2 */
- pmullw mm6, mm6 /* mm6 = pixel6^2 */
-
- paddw mm1, mm2 /* mm1 += pixel2^2 */
- paddw mm5, mm6 /* mm5 += pixel6^2 */
-
- movq mm2, [edi+48] /* Pixel 3 */
- movq mm6, [edi+112] /* Pixel 7 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 3 */
- paddw mm4, mm6 /* mm4 += pixel 7 */
-
- pmullw mm2, mm2 /* mm2 = pixel3^2 */
- pmullw mm6, mm6 /* mm6 = pixel7^2 */
-
- paddw mm1, mm2 /* mm1 += pixel3^2 */
- paddw mm5, mm6 /* mm5 += pixel7^2 */
-
- movq mm2, [edi+64] /* Pixel 4 */
- movq mm6, [edi+128] /* Pixel 8 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 4 */
- paddw mm4, mm6 /* mm4 += pixel 8 */
-
- pmullw mm2, mm2 /* mm2 = pixel4^2 */
- pmullw mm6, mm6 /* mm6 = pixel8^2 */
-
- paddw mm1, mm2 /* mm1 = pixel4^2 */
- paddw mm5, mm6 /* mm5 = pixel8^2 */
-
-
- /* mm0 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* mm1 = x1 + x2 + x3 + x4 */
- /* mm4 = x5^2 + x6^2 + x7^2 + x8^2 */
- /* mm5 = x5 + x6 + x7 + x8 */
-
- movq mm7, mm3 /* mm7 = mm3 */
- psrlw mm7, 7 /* mm7 = 0001000100010001 */
-
- movq mm2, mm0 /* make copy of sum1 */
- movq mm6, mm4 /* make copy of sum2 */
-
- paddw mm0, mm7 /* (sum1 + 1) */
- paddw mm4, mm7 /* (sum2 + 1) */
-
- psraw mm2, 1 /* sum1 /2 */
- psraw mm6, 1 /* sum2 /2 */
-
- psraw mm0, 1 /* (sum1 + 1)/2 */
- psraw mm4, 1 /* (sum2 + 1)/2 */
-
- pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw mm1, mm2 /* Variance 1 */
- psubw mm5, mm6 /* Variance 2 */
-
- movq mm7, FLimitMmx /* mm7 = FLimit */
- movq mm2, mm1 /* copy of Varinace 1*/
- movq mm6, mm5 /* Variance 2 */
- movq [Variance11], mm1 /* Save Variance1 */
- movq [Variance21], mm5 /* Save Variance2 */
- psubw mm1, mm7 /* Variance 1 < Flimit? */
-
- psubw mm5, mm7 /* Variance 2 < Flimit? */
- psraw mm2, 15 /* Variance 1 > 32768? */
- psraw mm6, 15 /* Vaiance 2 > 32768? */
- psraw mm1, 15 /* FFFF/0000 for true/false */
-
- psraw mm5, 15 /* FFFF/0000 for true/false */
- movq mm7, [edi+64] /* mm0 = Pixel 4 */
- pandn mm2, mm1 /* Variance1<32678 &&
- Variance1<Limit */
- pandn mm6, mm5 /* Variance2<32678 &&
- Variance1<Limit */
-
- movq mm4, [edi+80] /* mm4 = Pixel 5 */
- pand mm6, mm2 /* mm6 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movq mm2, mm7 /* make copy of Pixel4 */
- psubusw mm7, mm4 /* 4 - 5 */
- psubusw mm4, mm2 /* 5 - 4 */
-
- por mm7, mm4 /* abs(4 - 5) */
- psubw mm7, QStepMmx /* abs(4-5)<QStepMmx ? */
-
- psraw mm7, 15 /* FFFF/0000 for True/Flase */
- pand mm7, mm6
-
- /* mm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* now lets look at the right four colomn */
-
- add edi, 8 /* offset 8 to right 4 cols */
-
- movq mm2, [edi+16] /* Pixel 1 */
- movq mm6, [edi+80] /* Pixel 5 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- movq mm0, mm2 /* mm0 = pixel 1 */
- movq mm4, mm6 /* mm4 = pixel 5 */
-
- pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
- pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
-
- movq mm1, mm2 /* mm1 = pixel1^2 */
- movq mm5, mm6 /* mm5 = pixel5^2 */
-
- movq mm2, [edi+32] /* Pixel 2 */
- movq mm6, [edi+96] /* Pixel 6 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 2 */
- paddw mm4, mm6 /* mm4 += pixel 6 */
-
- pmullw mm2, mm2 /* mm2 = pixel2^2 */
- pmullw mm6, mm6 /* mm6 = pixel6^2 */
-
- paddw mm1, mm2 /* mm1 += pixel2^2 */
- paddw mm5, mm6 /* mm5 += pixel6^2 */
-
- movq mm2, [edi+48] /* Pixel 3 */
- movq mm6, [edi+112] /* Pixel 7 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 3 */
- paddw mm4, mm6 /* mm4 += pixel 7 */
-
- pmullw mm2, mm2 /* mm2 = pixel3^2 */
- pmullw mm6, mm6 /* mm6 = pixel7^2 */
-
- paddw mm1, mm2 /* mm1 += pixel3^2 */
- paddw mm5, mm6 /* mm5 += pixel7^2 */
-
- movq mm2, [edi+64] /* Pixel 4 */
- movq mm6, [edi+128] /* Pixel 8 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 4 */
- paddw mm4, mm6 /* mm4 += pixel 8 */
-
- pmullw mm2, mm2 /* mm2 = pixel4^2 */
- pmullw mm6, mm6 /* mm6 = pixel8^2 */
-
- paddw mm1, mm2 /* mm1 = pixel4^2 */
- paddw mm5, mm6 /* mm5 = pixel8^2 */
-
- /* mm0 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* mm1 = x1 + x2 + x3 + x4 */
- /* mm4 = x5^2 + x6^2 + x7^2 + x8^2 */
- /* mm5 = x5 + x6 + x7 + x8 */
-
- psrlw mm3, 7 /* mm3 = 0001000100010001 */
-
- movq mm2, mm0 /* make copy of sum1 */
- movq mm6, mm4 /* make copy of sum2 */
-
- paddw mm0, mm3 /* (sum1 + 1) */
- paddw mm4, mm3 /* (sum2 + 1) */
-
- psraw mm2, 1 /* sum1 /2 */
- psraw mm6, 1 /* sum2 /2 */
-
- psraw mm0, 1 /* (sum1 + 1)/2 */
- psraw mm4, 1 /* (sum2 + 1)/2 */
-
- pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw mm1, mm2 /* Variance 1 */
- psubw mm5, mm6 /* Variance 2 */
- movq [Variance12], mm1 /* Save Variance1 */
- movq [Variance22], mm5 /* Save Variance2 */
-
- movq mm3, FLimitMmx /* mm3 = FLimit */
- movq mm2, mm1 /* copy of Varinace 1*/
- movq mm6, mm5 /* Variance 2 */
- psubw mm1, mm3 /* Variance 1 < Flimit? */
-
- psubw mm5, mm3 /* Variance 2 < Flimit? */
- psraw mm2, 15 /* Variance 1 > 32768? */
- psraw mm6, 15 /* Vaiance 2 > 32768? */
- psraw mm1, 15 /* FFFF/0000 for true/false */
-
- psraw mm5, 15 /* FFFF/0000 for true/false */
- movq mm0, [edi+64] /* mm0 = Pixel 4 */
- pandn mm2, mm1 /* Variance1<32678 &&
- Variance1<Limit */
- pandn mm6, mm5 /* Variance2<32678 &&
- Variance1<Limit */
- movq mm4, [edi+80] /* mm4 = Pixel 5 */
- pand mm6, mm2 /* mm6 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movq mm2, mm0 /* make copy of Pixel4 */
-
- psubusw mm0, mm4 /* 4 - 5 */
- psubusw mm4, mm2 /* 5 - 4 */
-
- por mm0, mm4 /* abs(4 - 5) */
- psubw mm0, QStepMmx /* abs(4-5)<QStepMmx ? */
-
- psraw mm0, 15 /* FFFF/0000 for True/False */
- pand mm0, mm6
-
- sub edi, 8 /* offset edi back */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 and mm7 now are in use */
-
- /* find the loop filtered values for the pixels on block boundary */
- movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
- movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
- movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
- movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
- movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
- psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
- psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
- movq mm4, mm5 /* make a copy */
- paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
- paddw mm3, FourFours /* mm3 + 4 */
- paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
- paddw mm3, mm5 /* Filtval before shift */
- psraw mm3, 3 /* FiltVal */
- movq mm2, mm3 /* make a copy */
- psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor mm2, mm3
- psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
- por mm3, FourOnes /* -1 and 1 for + and - */
- movq mm4, mm1 /* make a copy of Flimit */
- psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
- movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
- psraw mm1, 15 /* FFFF or 0000 */
- pxor mm5, mm1
- psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw mm4, mm3 /* get the sign back */
- movq mm1, [edi+64] /* p[-1] */
- movq mm2, [edi+80] /* p[0] */
-
- paddw mm1, mm4 /* p[-1] + NewFiltVal */
- psubw mm2, mm4 /* p[0] - NewFiltVal */
- pxor mm6, mm6 /* clear mm6 */
-
- packuswb mm1, mm1 /* clamping */
- packuswb mm2, mm2 /* clamping */
- punpcklbw mm1, mm6 /* unpack to word */
- movq LoopFilteredValuesUp, mm1 /* save the values */
- punpcklbw mm2, mm6 /* unpack to word */
- movq LoopFilteredValuesDown, mm2 /* save the values */
-
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movq mm5, [edi] /* mm5 = -5 */
- movq mm4, [edi + 16] /* mm4 = -4 */
-
- movq mm3, mm4 /* copy of -4 */
- movq mm6, mm5 /* copy of -5 */
-
- psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
- psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
-
- por mm4, mm5 /* abs([-4]-[-5] ) */
- psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm1, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm1, mm3 /* */
-
- por mm1, mm4 /* mm1 = p1 */
-
- /* now find P2 */
-
- movq mm4, [edi+128] /* mm4 = [3] */
- movq mm5, [edi+144] /* mm5 = [4] */
-
- movq mm3, mm4 /* copy of 3 */
- movq mm6, mm5 /* copy of 4 */
-
- psubusw mm4, mm6 /* mm4 = [3] - [4] */
- psubusw mm5, mm3 /* mm5 = [4] - [3] */
-
- por mm4, mm5 /* abs([3]-[4] ) */
- psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm2, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm2, mm3 /* */
-
- por mm2, mm4 /* mm2 = p2 */
-
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
- /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
- /* Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm3, mm1 /* mm3 = p1 */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- movq mm4, mm3 /* mm4 = mm3 */
-
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
-
- psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
- psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
-
- paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
- psraw mm4, 4 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm7 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
- paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
-
- psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
- paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
-
- psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- paddw mm4, mm4 /* mm4 *=2 */
-
- paddw mm4, mm1 /* += p1 */
- psubw mm4, [edi+16] /* -= x1 */
-
- psubw mm4, [edi+112] /* -= x7 */
- paddw mm4, [edi+128] /* += x8 */
-
- movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
- psraw mm4, 4 /* >>=4 */
-
- psubw mm4, mm5 /* -=x4 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x4 */
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- paddw mm4, mm4 /* mm4 *= 2 */
-
- paddw mm4, [edi+16] /* += x1 */
- psubw mm4, [edi+32] /* -= x2 */
-
- psubw mm4, [edi+128] /* -= x8 */
- paddw mm4, mm2 /* += p2 */
-
- movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x4 */
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x5 */
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+32] /* +=x2 */
- psubw mm4, [edi+48] /* -=x3 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x6 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x6 */
-
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+48] /* +=x3 */
- psubw mm4, [edi+64] /* -=x4 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x7 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x7 */
-
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+64] /* +=x4 */
- psubw mm4, [edi+80] /* -=x5 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x8 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x8 */
-
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with left four columns */
- /* now do the righ four columns */
-
- add edi, 8 /* shift to right four column */
- add esi, 8 /* shift to right four column */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 now are in use */
-
- /* find the loop filtered values for the pixels on block boundary */
- movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
- movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
- movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
- movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
- movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
- psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
- psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
- movq mm4, mm5 /* make a copy */
- paddw mm3, FourFours /* mm3 + 4 */
- paddw mm4, mm4 /* 2 * ( p[0] - p[-1] ) */
- paddw mm3, mm4 /* 3 * ( p[0] - p[-1] ) */
- paddw mm3, mm5 /* Filtval before shift */
- psraw mm3, 3 /* FiltVal */
- movq mm2, mm3 /* make a copy */
- psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor mm2, mm3
- psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
- por mm3, FourOnes /* -1 and 1 for + and - */
- movq mm4, mm1 /* make a copy of Flimit */
- psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
- movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
- psraw mm1, 15 /* FFFF or 0000 */
- pxor mm5, mm1
- psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw mm4, mm3 /* get the sign back */
- movq mm1, [edi+64] /* p[-1] */
- movq mm2, [edi+80] /* p[0] */
-
- paddw mm1, mm4 /* p[-1] + NewFiltVal */
- psubw mm2, mm4 /* p[0] - NewFiltVal */
- pxor mm6, mm6 /* clear mm6 */
-
- packuswb mm1, mm1 /* clamping */
- packuswb mm2, mm2 /* clamping */
- punpcklbw mm1, mm6 /* unpack to word */
- movq LoopFilteredValuesUp, mm1 /* save the values */
- punpcklbw mm2, mm6 /* unpack to word */
- movq LoopFilteredValuesDown, mm2 /* save the values */
-
-
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movq mm5, [edi] /* mm5 = -5 */
- movq mm4, [edi + 16] /* mm4 = -4 */
-
- movq mm3, mm4 /* copy of -4 */
- movq mm6, mm5 /* copy of -5 */
-
- psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
- psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
-
- por mm4, mm5 /* abs([-4]-[-5] ) */
- psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm1, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm1, mm3 /* */
-
- por mm1, mm4 /* mm1 = p1 */
-
- /* now find P2 */
-
- movq mm4, [edi+128] /* mm4 = [3] */
- movq mm5, [edi+144] /* mm5 = [4] */
-
- movq mm3, mm4 /* copy of 3 */
- movq mm6, mm5 /* copy of 4 */
-
- psubusw mm4, mm6 /* mm4 = [3] - [4] */
- psubusw mm5, mm3 /* mm5 = [4] - [3] */
-
- por mm4, mm5 /* abs([3]-[4] ) */
- psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm2, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm2, mm3 /* */
-
- por mm2, mm4 /* mm2 = p2 */
-
- /* psum = p1 + p1 + p1 + v[1] + v[2] + v[3] + v[4] + 4; */
- /* Des[-w4] = (((psum + v[1]) << 1) - (v[4] - v[5])) >> 4; */
- /* Des[-w4]=Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm3, mm1 /* mm3 = p1 */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- movq mm4, mm3 /* mm4 = mm3 */
-
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
-
- psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
- psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
-
- paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
- psraw mm4, 4 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm0 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
- paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
-
- psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
- paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
-
- psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- paddw mm4, mm4 /* mm4 *=2 */
-
- paddw mm4, mm1 /* += p1 */
- psubw mm4, [edi+16] /* -= x1 */
-
- psubw mm4, [edi+112] /* -= x7 */
- paddw mm4, [edi+128] /* += x8 */
-
- movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x4 */
- movq [esi+48], mm4 /* write new x4 */
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- paddw mm4, mm4 /* mm4 *= 2 */
-
- paddw mm4, [edi+16] /* += x1 */
- psubw mm4, [edi+32] /* -= x2 */
-
- psubw mm4, [edi+128] /* -= x8 */
- paddw mm4, mm2 /* += p2 */
- movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x4 */
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x5 */
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+32] /* +=x2 */
- psubw mm4, [edi+48] /* -=x3 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x6 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x6 */
-
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+48] /* +=x3 */
- psubw mm4, [edi+64] /* -=x4 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x7 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x7 */
-
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+64] /* +=x4 */
- psubw mm4, [edi+80] /* -=x5 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x8 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x8 */
-
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with right four column */
- add edi, 8 /* shift edi to point x1 */
- sub esi, 8 /* shift esi back to x1 */
- mov ebp, Des /* the destination */
- lea ebp, [ebp + edx *4] /* point to des[-w4] */
-
- movq mm0, [esi]
- packuswb mm0, [esi + 8]
-
- movq [ebp], mm0 /* write des[-w4] */
-
- movq mm1, [esi + 16]
- packuswb mm1, [esi + 24]
-
- movq [ebp+ecx ], mm1 /* write des[-w3] */
-
- movq mm2, [esi + 32]
- packuswb mm2, [esi + 40]
-
- movq [ebp+ecx*2 ], mm2 /* write des[-w2] */
-
- movq mm3, [esi + 48]
- packuswb mm3, [esi + 56]
-
- lea ebp, [ebp+ecx*4] /* point to des[0] */
- movq [ebp+edx], mm3 /* write des[-w1] */
-
- movq mm0, [esi + 64]
- packuswb mm0, [esi + 72]
-
- movq [ebp ], mm0 /* write des[0] */
-
- movq mm1, [esi + 80]
- packuswb mm1, [esi + 88]
-
- movq [ebp+ecx], mm1 /* write des[w1] */
-
- movq mm2, [esi + 96]
- packuswb mm2, [esi + 104]
-
- movq [ebp+ecx*2], mm2 /* write des[w2] */
-
- movq mm3, [esi + 112]
- packuswb mm3, [esi + 120]
-
- lea ebp, [ebp+ecx*2] /* point to des[w4] */
- movq [ebp+ecx], mm3 /* write des[w3] */
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebp
- pop eax
-
- } /* end of the macro */
-
- Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
- Var1 += Variance12[0]+ Variance12[1]+Variance12[2]+Variance12[3];
- pbi->FragmentVariances[CurrentFrag] += Var1;
- Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
- Var2 += Variance22[0]+ Variance22[1]+Variance22[2]+Variance22[3];
- pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
-
- if(CurrentFrag==StartFrag)
- CurrentFrag++;
- else
- {
-
- Des=DesPtr-8*PlaneLineStep+8*(CurrentFrag-StartFrag);
- Src=Des;
- QStep = QuantScale[pbi->FragQIndex[CurrentFrag]];
- for( j=0; j<8;j++)
- {
- Rows[j] = (short) (Src[-5 +j*PlaneLineStep]);
- Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
- }
- __asm
- {
- /* Save the registers */
- push eax
- push ebp
- /* Calculate the FLimit and store FLimit and QStep */
- mov eax, QStep /* get QStep */
- movd mm0, eax /* mm0 = 0, 0, 0, Q */
- push ecx
-
- punpcklwd mm0, mm0 /* mm0 = 0, 0, Q, Q */
- movq mm1, FourThrees /* mm1 = 03 03 03 03 */
- push edx
-
- punpckldq mm0, mm0 /* mm0 = Q, Q, Q, Q */
- movq QStepMmx, mm0 /* write the Q step */
- push esi
- pmullw mm1, mm0 /* mm1 = QStep * 3 */
- pmullw mm1, mm0 /* mm1 = QStep * QStep * 3 */
- push edi
-
-
- psrlw mm1, 5 /* mm1 = FLimit */
- movq [FLimitMmx], mm1 /* Save FLimit */
- /* setup the pointers to data */
- mov eax, Src /* eax = Src */
- xor edx, edx /* clear edx */
-
- sub eax, 4 /* eax = Src-4 */
- lea esi, NewRows /* esi = NewRows */
- lea edi, Rows /* edi = Rows */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- sub edx, ecx /* edx = -Pitch */
- /* Get the data to the intermediate buffer */
- movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
- movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
- movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
- lea eax, [eax+ecx*4] /* Go down four Rows */
- movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
- movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
-
- punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
- punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
- movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
- punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
- punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
- movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
- punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
- punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
-
- movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
- punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
- punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
- pxor mm7, mm7 /* clear mm7 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
- movq [edi+16], mm0 /* write 00 10 20 30 */
- punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
- movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
- movq [edi+32], mm5 /* write 01 11 21 31 */
-
- punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
- punpckhbw mm0, mm7 /* mm0 = 33 23 12 03 */
- movq [edi+48], mm1 /* write 02 12 22 32 */
- movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
-
- movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
- movq [edi+64], mm0 /* write 03 13 23 33 */
- punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
- punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
- movq [edi+80], mm2 /* write 04 14 24 34 */
- punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
- punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
- movq [edi+96], mm3 /* write 05 15 25 35 */
-
- movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
- movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
- movq [edi+112], mm4 /* write 06 16 26 37 */
- movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
- lea eax, [eax+ ecx*4] /* Go down four rows */
- movq [edi+128], mm5 /* write 07 17 27 37 */
- movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
- movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
- punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
- punpckhbw mm4, mm1 /* mm4 = 57 57 56 46 55 45 54 44 */
- movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
- punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
- punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
- movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
- punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
- punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
-
- movq mm2, mm4 /* mm2 = 57 57 56 46 55 45 54 44 */
- punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
- punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
- movq [edi+24], mm0 /* write 40 50 60 70 */
- punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
- movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
- movq [edi+40], mm5 /* write 41 51 61 71 */
-
- punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
- punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
- movq [edi+56], mm1 /* write 42 52 62 72 */
- movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
-
- movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
- movq [edi+72], mm0 /* write 43 53 63 73 */
- punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
- punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
- movq [edi+88], mm2 /* write 44 54 64 74 */
- punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
- punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
- movq [edi+104], mm3 /* write 45 55 65 75 */
-
- movq [edi+120], mm4 /* write 46 56 66 76 */
- movq [edi+136], mm5 /* write 47 57 67 77 */
- /* Now, compute the variances for Pixel 1-4 and 5-8 */
-
- /* we use mm0,mm1,mm2 for 1234 and mm4, mm5, mm6 for 5-8 */
- /* mm7 = 0, mm3 = {128, 128, 128, 128} */
-
- pcmpeqw mm3, mm3 /* mm3 = FFFFFFFFFFFFFFFF */
- psllw mm3, 15 /* mm3 = 8000800080008000 */
- psrlw mm3, 8 /* mm3 = 0080008000800080 */
-
- movq mm2, [edi+16] /* Pixel 1 */
- movq mm6, [edi+80] /* Pixel 5 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- movq mm0, mm2 /* mm0 = pixel 1 */
- movq mm4, mm6 /* mm4 = pixel 5 */
-
- pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
- pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
-
- movq mm1, mm2 /* mm1 = pixel1^2 */
- movq mm5, mm6 /* mm5 = pixel5^2 */
-
- movq mm2, [edi+32] /* Pixel 2 */
- movq mm6, [edi+96] /* Pixel 6 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 2 */
- paddw mm4, mm6 /* mm4 += pixel 6 */
-
- pmullw mm2, mm2 /* mm2 = pixel2^2 */
- pmullw mm6, mm6 /* mm6 = pixel6^2 */
-
- paddw mm1, mm2 /* mm1 += pixel2^2 */
- paddw mm5, mm6 /* mm5 += pixel6^2 */
-
- movq mm2, [edi+48] /* Pixel 3 */
- movq mm6, [edi+112] /* Pixel 7 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 3 */
- paddw mm4, mm6 /* mm4 += pixel 7 */
-
- pmullw mm2, mm2 /* mm2 = pixel3^2 */
- pmullw mm6, mm6 /* mm6 = pixel7^2 */
-
- paddw mm1, mm2 /* mm1 += pixel3^2 */
- paddw mm5, mm6 /* mm5 += pixel7^2 */
-
- movq mm2, [edi+64] /* Pixel 4 */
- movq mm6, [edi+128] /* Pixel 8 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 4 */
- paddw mm4, mm6 /* mm4 += pixel 8 */
-
- pmullw mm2, mm2 /* mm2 = pixel4^2 */
- pmullw mm6, mm6 /* mm6 = pixel8^2 */
-
- paddw mm1, mm2 /* mm1 = pixel4^2 */
- paddw mm5, mm6 /* mm5 = pixel8^2 */
-
- /* mm0 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* mm1 = x1 + x2 + x3 + x4 */
- /* mm4 = x5^2 + x6^2 + x7^2 + x8^2 */
- /* mm5 = x5 + x6 + x7 + x8 */
-
- movq mm7, mm3 /* mm7 = mm3 */
- psrlw mm7, 7 /* mm7 = 0001000100010001 */
-
- movq mm2, mm0 /* make copy of sum1 */
- movq mm6, mm4 /* make copy of sum2 */
-
- paddw mm0, mm7 /* (sum1 + 1) */
- paddw mm4, mm7 /* (sum2 + 1) */
-
- psraw mm2, 1 /* sum1 /2 */
- psraw mm6, 1 /* sum2 /2 */
-
- psraw mm0, 1 /* (sum1 + 1)/2 */
- psraw mm4, 1 /* (sum2 + 1)/2 */
-
- pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw mm1, mm2 /* Variance 1 */
- psubw mm5, mm6 /* Variance 2 */
-
- movq [Variance11], mm1 /* Save Variance1 */
- movq [Variance21], mm5 /* Save Variance2 */
- movq mm7, FLimitMmx /* mm7 = FLimit */
- movq mm2, mm1 /* copy of Variance 1*/
- movq mm6, mm5 /* copy of Variance 2*/
- psubw mm1, mm7 /* Variance 1 < Flimit? */
-
- psubw mm5, mm7 /* Variance 2 < Flimit? */
- psraw mm1, 15 /* FFFF/0000 for true/false */
-
- psraw mm5, 15 /* FFFF/0000 for true/false */
- psraw mm2, 15 /* Variance 1 > 32768 ? */
- psraw mm6, 15 /* Variance 2 > 32768 ? */
- movq mm7, [edi+64] /* mm0 = Pixel 4 */
- pandn mm2, mm1 /* Variance 1 < Flimit &&
- Variance 1 < 32768 */
- pandn mm6, mm5 /* Variance 2 < Flimit &&
- Variance 2 < 32768 */
- movq mm4, [edi+80] /* mm4 = Pixel 5 */
- pand mm6, mm2 /* mm1 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movq mm2, mm7 /* make copy of Pixel4 */
- psubusw mm7, mm4 /* 4 - 5 */
- psubusw mm4, mm2 /* 5 - 4 */
-
- por mm7, mm4 /* abs(4 - 5) */
- psubw mm7, QStepMmx /* abs(4-5)<QStepMmx ? */
-
- psraw mm7, 15 /* FFFF/0000 for True/Flase */
- pand mm7, mm6
-
- /* mm7 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* now lets look at the right four colomn */
-
- add edi, 8 /* offset 8 to right 4 cols */
-
- movq mm2, [edi+16] /* Pixel 1 */
- movq mm6, [edi+80] /* Pixel 5 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- movq mm0, mm2 /* mm0 = pixel 1 */
- movq mm4, mm6 /* mm4 = pixel 5 */
-
- pmullw mm2, mm2 /* mm2 = pixel1 * pixel1 */
- pmullw mm6, mm6 /* mm6 = pixel5 * pixel5 */
-
- movq mm1, mm2 /* mm1 = pixel1^2 */
- movq mm5, mm6 /* mm5 = pixel5^2 */
-
- movq mm2, [edi+32] /* Pixel 2 */
- movq mm6, [edi+96] /* Pixel 6 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 2 */
- paddw mm4, mm6 /* mm4 += pixel 6 */
-
- pmullw mm2, mm2 /* mm2 = pixel2^2 */
- pmullw mm6, mm6 /* mm6 = pixel6^2 */
-
- paddw mm1, mm2 /* mm1 += pixel2^2 */
- paddw mm5, mm6 /* mm5 += pixel6^2 */
-
- movq mm2, [edi+48] /* Pixel 3 */
- movq mm6, [edi+112] /* Pixel 7 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 3 */
- paddw mm4, mm6 /* mm4 += pixel 7 */
-
- pmullw mm2, mm2 /* mm2 = pixel3^2 */
- pmullw mm6, mm6 /* mm6 = pixel7^2 */
-
- paddw mm1, mm2 /* mm1 += pixel3^2 */
- paddw mm5, mm6 /* mm5 += pixel7^2 */
-
- movq mm2, [edi+64] /* Pixel 4 */
- movq mm6, [edi+128] /* Pixel 8 */
-
- psubw mm2, mm3 /* mm2 -=128 */
- psubw mm6, mm3 /* mm6 -=128 */
-
- paddw mm0, mm2 /* mm0 += pixel 4 */
- paddw mm4, mm6 /* mm4 += pixel 8 */
-
- pmullw mm2, mm2 /* mm2 = pixel4^2 */
- pmullw mm6, mm6 /* mm6 = pixel8^2 */
-
- paddw mm1, mm2 /* mm1 = pixel4^2 */
- paddw mm5, mm6 /* mm5 = pixel8^2 */
-
- /* mm0 = x1^2 + x2^2 + x3^2 + x4^2 */
- /* mm1 = x1 + x2 + x3 + x4 */
- /* mm4 = x5^2 + x6^2 + x7^2 + x8^2 */
- /* mm5 = x5 + x6 + x7 + x8 */
-
- psrlw mm3, 7 /* mm3 = 0001000100010001 */
-
- movq mm2, mm0 /* make copy of sum1 */
- movq mm6, mm4 /* make copy of sum2 */
-
- paddw mm0, mm3 /* (sum1 + 1) */
- paddw mm4, mm3 /* (sum2 + 1) */
-
- psraw mm2, 1 /* sum1 /2 */
- psraw mm6, 1 /* sum2 /2 */
-
- psraw mm0, 1 /* (sum1 + 1)/2 */
- psraw mm4, 1 /* (sum2 + 1)/2 */
-
- pmullw mm2, mm0 /* (sum1)/2*(sum1+1)/2 */
- pmullw mm6, mm4 /* (sum2)/2*(sum2+1)/2 */
-
- psubw mm1, mm2 /* Variance 1 */
- psubw mm5, mm6 /* Variance 2 */
-
- movq [Variance12], mm1 /* Save Variance1 */
- movq [Variance22], mm5 /* Save Variance2 */
-
- movq mm3, FLimitMmx /* mm3 = FLimit */
- movq mm2, mm1 /* copy of Varinace 1*/
- movq mm6, mm5 /* Variance 2 */
- psubw mm1, mm3 /* Variance 1 < Flimit? */
-
- psubw mm5, mm3 /* Variance 2 < Flimit? */
- psraw mm6, 15 /* Variance 1 > 32768 */
-
- psraw mm2, 15 /* Variance 2 > 32768 */
- psraw mm1, 15 /* FFFF/0000 for true/false */
-
- psraw mm5, 15 /* FFFF/0000 for true/false */
- movq mm0, [edi+64] /* mm0 = Pixel 4 */
- pandn mm2, mm1 /* Variance1<32678 &&
- Variance1<Limit */
- pandn mm6, mm5 /* Variance2<32678 &&
- Variance1<Limit */
-
- movq mm4, [edi+80] /* mm4 = Pixel 5 */
- pand mm6, mm2 /* mm1 = Variance1 < Flimit */
- /* &&Variance2 < Flimit */
- movq mm2, mm0 /* make copy of Pixel4 */
-
- psubusw mm0, mm4 /* 4 - 5 */
- psubusw mm4, mm2 /* 5 - 4 */
-
- por mm0, mm4 /* abs(4 - 5) */
- psubw mm0, QStepMmx /* abs(4-5)<QStepMmx ? */
-
- psraw mm0, 15 /* FFFF/0000 for True/False */
- pand mm0, mm6
-
- sub edi, 8 /* offset edi back */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 and mm7 now are in use */
- /* find the loop filtered values for the pixels on block boundary */
- movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
- movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
- movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
- movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
- movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
- psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
- psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
- movq mm4, mm5 /* make a copy */
- paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
- paddw mm3, FourFours /* mm3 + 4 */
- paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
- paddw mm3, mm5 /* Filtval before shift */
- psraw mm3, 3 /* FiltVal */
- movq mm2, mm3 /* make a copy */
- psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor mm2, mm3
- psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
- por mm3, FourOnes /* -1 and 1 for + and - */
- movq mm4, mm1 /* make a copy of Flimit */
- psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
- movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
- psraw mm1, 15 /* FFFF or 0000 */
- pxor mm5, mm1
- psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw mm4, mm3 /* get the sign back */
- movq mm1, [edi+64] /* p[-1] */
- movq mm2, [edi+80] /* p[0] */
-
- paddw mm1, mm4 /* p[-1] + NewFiltVal */
- psubw mm2, mm4 /* p[0] - NewFiltVal */
- pxor mm6, mm6 /* clear mm6 */
-
- packuswb mm1, mm1 /* clamping */
- packuswb mm2, mm2 /* clamping */
- punpcklbw mm1, mm6 /* unpack to word */
- movq LoopFilteredValuesUp, mm1 /* save the values */
- punpcklbw mm2, mm6 /* unpack to word */
- movq LoopFilteredValuesDown, mm2 /* save the values */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movq mm5, [edi] /* mm5 = -5 */
- movq mm4, [edi + 16] /* mm4 = -4 */
-
- movq mm3, mm4 /* copy of -4 */
- movq mm6, mm5 /* copy of -5 */
-
- psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
- psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
-
- por mm4, mm5 /* abs([-4]-[-5] ) */
- psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm1, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm1, mm3 /* */
-
- por mm1, mm4 /* mm1 = p1 */
-
- /* now find P2 */
-
- movq mm4, [edi+128] /* mm4 = [3] */
- movq mm5, [edi+144] /* mm5 = [4] */
-
- movq mm3, mm4 /* copy of 3 */
- movq mm6, mm5 /* copy of 4 */
-
- psubusw mm4, mm6 /* mm4 = [3] - [4] */
- psubusw mm5, mm3 /* mm5 = [4] - [3] */
-
- por mm4, mm5 /* abs([3]-[4] ) */
- psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm2, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm2, mm3 /* */
-
- por mm2, mm4 /* mm2 = p2 */
-
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
- /* Des[-w4] = (((sum + x1) << 1) - (x4 - x5)) >> 4; */
- /* Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm3, mm1 /* mm3 = p1 */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- movq mm4, mm3 /* mm4 = mm3 */
-
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
-
- psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
- psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
-
- paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
- psraw mm4, 4 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm7 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
- paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
-
- psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
- paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
-
- psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- paddw mm4, mm4 /* mm4 *=2 */
-
- paddw mm4, mm1 /* += p1 */
- psubw mm4, [edi+16] /* -= x1 */
-
- psubw mm4, [edi+112] /* -= x7 */
- paddw mm4, [edi+128] /* += x8 */
- movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x4 */
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- paddw mm4, mm4 /* mm4 *= 2 */
-
- paddw mm4, [edi+16] /* += x1 */
- psubw mm4, [edi+32] /* -= x2 */
-
- psubw mm4, [edi+128] /* -= x8 */
- paddw mm4, mm2 /* += p2 */
- movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x4 */
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x5 */
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+32] /* +=x2 */
- psubw mm4, [edi+48] /* -=x3 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x6 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x6 */
-
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+48] /* +=x3 */
- psubw mm4, [edi+64] /* -=x4 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x7 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x7 */
-
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+64] /* +=x4 */
- psubw mm4, [edi+80] /* -=x5 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x8 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x8 */
-
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with left four columns */
- /* now do the righ four columns */
-
- add edi, 8 /* shift to right four column */
- add esi, 8 /* shift to right four column */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 now are in use */
- /* find the loop filtered values for the pixels on block boundary */
- movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
- movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
- movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
- movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
- movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
- psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
- psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
- movq mm4, mm5 /* make a copy */
- paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
- paddw mm3, FourFours /* mm3 + 4 */
- paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
- paddw mm3, mm5 /* Filtval before shift */
- psraw mm3, 3 /* FiltVal */
- movq mm2, mm3 /* make a copy */
- psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor mm2, mm3
- psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
- por mm3, FourOnes /* -1 and 1 for + and - */
- movq mm4, mm1 /* make a copy of Flimit */
- psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
- movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
- psraw mm1, 15 /* FFFF or 0000 */
- pxor mm5, mm1
- psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw mm4, mm3 /* get the sign back */
- movq mm1, [edi+64] /* p[-1] */
- movq mm2, [edi+80] /* p[0] */
-
- paddw mm1, mm4 /* p[-1] + NewFiltVal */
- psubw mm2, mm4 /* p[0] - NewFiltVal */
- pxor mm6, mm6 /* clear mm6 */
-
- packuswb mm1, mm1 /* clamping */
- packuswb mm2, mm2 /* clamping */
- punpcklbw mm1, mm6 /* unpack to word */
- movq LoopFilteredValuesUp, mm1 /* save the values */
- punpcklbw mm2, mm6 /* unpack to word */
- movq LoopFilteredValuesDown, mm2 /* save the values */
- /* Let's do the filtering now */
- /* p1 = (abs(Src[-4] - Src[-5]) < QStep ) ? Src[-5] : Src[-4]; */
- /* p2 = (abs(Src[+3] - Src[+4]) < QStep ) ? Src[+4] : Src[+3]; */
-
- movq mm5, [edi] /* mm5 = -5 */
- movq mm4, [edi + 16] /* mm4 = -4 */
-
- movq mm3, mm4 /* copy of -4 */
- movq mm6, mm5 /* copy of -5 */
-
- psubusw mm4, mm6 /* mm4 = [-4] - [-5] */
- psubusw mm5, mm3 /* mm5 = [-5] - [-4] */
-
- por mm4, mm5 /* abs([-4]-[-5] ) */
- psubw mm4, QStepMmx /* abs([-4]-[-5] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm1, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm1, mm3 /* */
-
- por mm1, mm4 /* mm1 = p1 */
-
- /* now find P2 */
-
- movq mm4, [edi+128] /* mm4 = [3] */
- movq mm5, [edi+144] /* mm5 = [4] */
-
- movq mm3, mm4 /* copy of 3 */
- movq mm6, mm5 /* copy of 4 */
-
- psubusw mm4, mm6 /* mm4 = [3] - [4] */
- psubusw mm5, mm3 /* mm5 = [4] - [3] */
-
- por mm4, mm5 /* abs([3]-[4] ) */
- psubw mm4, QStepMmx /* abs([3]-[4] )<QStep? */
-
- psraw mm4, 15 /* FFFF/0000 for True/False */
- movq mm2, mm4 /* copy of the mm4 */
-
- pand mm4, mm6 /* */
- pandn mm2, mm3 /* */
-
- por mm2, mm4 /* mm2 = p2 */
-
- /* psum = p1 + p1 + p1 + v[1] + v[2] + v[3] + v[4] + 4; */
- /* Des[-w4] = (((psum + v[1]) << 1) - (v[4] - v[5])) >> 4; */
- /* Des[-w4]=Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm3, mm1 /* mm3 = p1 */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- movq mm4, mm3 /* mm4 = mm3 */
-
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
-
- psllw mm4, 1 /* mm4 = (sum+x1)<<1 */
- psubw mm4, [edi+64] /* mm4 = (sum+x1)<<1-x4 */
-
- paddw mm4, [edi+80] /* mm4 = (sum+x1)<<1-x4+x5 */
- psraw mm4, 4 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm0 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)<<1-x5+x6)>>4 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+80] /* mm4 =(sum+x2)<<1-x5 */
- paddw mm4, [edi+96] /* mm4 =(sum+x2)<<1-x5+x6 */
-
- psraw mm4, 4 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])<<1-x[6]+x[7])>>4 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- paddw mm4, mm4 /* mm4 <<= 1 */
-
- psubw mm4, [edi+96] /* mm4 =(sum+x3)<<1-x6 */
- paddw mm4, [edi+112] /* mm4 =(sum+x3)<<1-x6+x7 */
-
- psraw mm4, 4 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- paddw mm4, mm4 /* mm4 *=2 */
-
- paddw mm4, mm1 /* += p1 */
- psubw mm4, [edi+16] /* -= x1 */
-
- psubw mm4, [edi+112] /* -= x7 */
- paddw mm4, [edi+128] /* += x8 */
- movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x4 */
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)<<1+x1-x2-x8+p2)>>4 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- paddw mm4, mm4 /* mm4 *= 2 */
-
- paddw mm4, [edi+16] /* += x1 */
- psubw mm4, [edi+32] /* -= x2 */
-
- psubw mm4, [edi+128] /* -= x8 */
- paddw mm4, mm2 /* += p2 */
- movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x5 */
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x5 */
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)<<1 + x2-x3)>>4 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+32] /* +=x2 */
- psubw mm4, [edi+48] /* -=x3 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x6 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x6 */
-
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = ((sum+x7)<<1 + x3-x4)>>4 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+48] /* +=x3 */
- psubw mm4, [edi+64] /* -=x4 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x7 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x7 */
-
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)<<1 + x4-x5)>>4 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- paddw mm4, mm4 /* mm4 *= 2*/
-
- paddw mm4, [edi+64] /* +=x4 */
- psubw mm4, [edi+80] /* -=x5 */
-
- psraw mm4, 4 /* >>=4 */
- psubw mm4, mm5 /* -=x8 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x8 */
-
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with right four column */
- /* transpose */
- mov eax, Des /* the destination */
- add edi, 8 /* shift edi to point x1 */
- sub esi, 8 /* shift esi back to left x1 */
- sub eax, 4
- movq mm0, [esi] /* mm0 = 30 20 10 00 */
- movq mm1, [esi+16] /* mm1 = 31 21 11 01 */
- movq mm4, mm0 /* mm4 = 30 20 10 00 */
- punpcklwd mm0, mm1 /* mm0 = 11 10 01 00 */
- punpckhwd mm4, mm1 /* mm4 = 31 30 21 20 */
- movq mm2, [esi+32] /* mm2 = 32 22 12 02 */
- movq mm3, [esi+48] /* mm3 = 33 23 13 03 */
- movq mm5, mm2 /* mm5 = 32 22 12 02 */
- punpcklwd mm2, mm3 /* mm2 = 13 12 03 02 */
- punpckhwd mm5, mm3 /* mm5 = 33 32 23 22 */
- movq mm1, mm0 /* mm1 = 11 10 01 00 */
- punpckldq mm0, mm2 /* mm0 = 03 02 01 00 */
- movq [edi], mm0 /* write 00 01 02 03 */
- punpckhdq mm1, mm2 /* mm1 = 13 12 11 10 */
-
- movq mm0, mm4 /* mm0 = 31 30 21 20 */
- movq [edi+16], mm1 /* write 10 11 12 13 */
- punpckldq mm0, mm5 /* mm0 = 23 22 21 20 */
- punpckhdq mm4, mm5 /* mm4 = 33 32 31 30 */
- movq mm1, [esi+64] /* mm1 = 34 24 14 04 */
- movq mm2, [esi+80] /* mm2 = 35 25 15 05 */
- movq mm5, [esi+96] /* mm5 = 36 26 16 06 */
- movq mm6, [esi+112] /* mm6 = 37 27 17 07 */
-
- movq mm3, mm1 /* mm3 = 34 24 14 04 */
- movq mm7, mm5 /* mm7 = 36 26 16 06 */
- punpcklwd mm1, mm2 /* mm1 = 15 14 05 04 */
- punpckhwd mm3, mm2 /* mm3 = 35 34 25 24 */
- punpcklwd mm5, mm6 /* mm5 = 17 16 07 06 */
- punpckhwd mm7, mm6 /* mm7 = 37 36 27 26 */
- movq mm2, mm1 /* mm2 = 15 14 05 04 */
- movq mm6, mm3 /* mm6 = 35 34 25 24 */
- punpckldq mm1, mm5 /* mm1 = 07 06 05 04 */
- punpckhdq mm2, mm5 /* mm2 = 17 16 15 14 */
- punpckldq mm3, mm7 /* mm3 = 27 26 25 24 */
- punpckhdq mm6, mm7 /* mm6 = 37 36 35 34 */
-
- movq mm5, [edi] /* mm5 = 03 02 01 00 */
- packuswb mm5, mm1 /* mm5 = 07 06 05 04 03 02 01 00 */
-
- movq [eax], mm5 /* write 00 01 02 03 04 05 06 07 */
- movq mm7, [edi+16] /* mm7 = 13 12 11 10 */
- packuswb mm7, mm2 /* mm7 = 17 16 15 14 13 12 11 10 */
- movq [eax+ecx], mm7 /* write 10 11 12 13 14 15 16 17 */
- packuswb mm0, mm3 /* mm0 = 27 26 25 24 23 22 21 20 */
- packuswb mm4, mm6 /* mm4 = 37 36 35 34 33 32 31 30 */
-
- movq [eax+ecx*2], mm0 /* write 20 21 22 23 24 25 26 27 */
- lea eax, [eax+ecx*4] /* mov forward the desPtr */
- movq [eax+edx], mm4 /* write 30 31 32 33 34 35 36 37 */
- add edi, 8 /* move to right four column */
- add esi, 8 /* move to right x1 */
- movq mm0, [esi] /* mm0 = 70 60 50 40 */
- movq mm1, [esi+16] /* mm1 = 71 61 51 41 */
- movq mm4, mm0 /* mm4 = 70 60 50 40 */
- punpcklwd mm0, mm1 /* mm0 = 51 50 41 40 */
- punpckhwd mm4, mm1 /* mm4 = 71 70 61 60 */
- movq mm2, [esi+32] /* mm2 = 72 62 52 42 */
- movq mm3, [esi+48] /* mm3 = 73 63 53 43 */
- movq mm5, mm2 /* mm5 = 72 62 52 42 */
- punpcklwd mm2, mm3 /* mm2 = 53 52 43 42 */
- punpckhwd mm5, mm3 /* mm5 = 73 72 63 62 */
- movq mm1, mm0 /* mm1 = 51 50 41 40 */
- punpckldq mm0, mm2 /* mm0 = 43 42 41 40 */
- movq [edi], mm0 /* write 40 41 42 43 */
- punpckhdq mm1, mm2 /* mm1 = 53 52 51 50 */
-
- movq mm0, mm4 /* mm0 = 71 70 61 60 */
- movq [edi+16], mm1 /* write 50 51 52 53 */
- punpckldq mm0, mm5 /* mm0 = 63 62 61 60 */
- punpckhdq mm4, mm5 /* mm4 = 73 72 71 70 */
- movq mm1, [esi+64] /* mm1 = 74 64 54 44 */
- movq mm2, [esi+80] /* mm2 = 75 65 55 45 */
- movq mm5, [esi+96] /* mm5 = 76 66 56 46 */
- movq mm6, [esi+112] /* mm6 = 77 67 57 47 */
-
- movq mm3, mm1 /* mm3 = 74 64 54 44 */
- movq mm7, mm5 /* mm7 = 76 66 56 46 */
- punpcklwd mm1, mm2 /* mm1 = 55 54 45 44 */
- punpckhwd mm3, mm2 /* mm3 = 75 74 65 64 */
- punpcklwd mm5, mm6 /* mm5 = 57 56 47 46 */
- punpckhwd mm7, mm6 /* mm7 = 77 76 67 66 */
- movq mm2, mm1 /* mm2 = 55 54 45 44 */
- movq mm6, mm3 /* mm6 = 75 74 65 64 */
- punpckldq mm1, mm5 /* mm1 = 47 46 45 44 */
- punpckhdq mm2, mm5 /* mm2 = 57 56 55 54 */
- punpckldq mm3, mm7 /* mm3 = 67 66 65 64 */
- punpckhdq mm6, mm7 /* mm6 = 77 76 75 74 */
-
- movq mm5, [edi] /* mm5 = 43 42 41 40 */
- packuswb mm5, mm1 /* mm5 = 47 46 45 44 43 42 41 40 */
-
- movq [eax], mm5 /* write 40 41 42 43 44 45 46 47 */
- movq mm7, [edi+16] /* mm7 = 53 52 51 50 */
- packuswb mm7, mm2 /* mm7 = 57 56 55 54 53 52 51 50 */
- movq [eax+ecx], mm7 /* write 50 51 52 53 54 55 56 57 */
- packuswb mm0, mm3 /* mm0 = 67 66 65 64 63 62 61 60 */
- packuswb mm4, mm6 /* mm4 = 77 76 75 74 73 72 71 70 */
-
- movq [eax+ecx*2], mm0 /* write 60 61 62 63 64 65 66 67 */
- lea eax, [eax+ecx*4] /* mov forward the desPtr */
- movq [eax+edx], mm4 /* write 70 71 72 73 74 75 76 77 */
-
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebp
- pop eax
- }//__asm
- Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
- Var1 += Variance12[0]+ Variance12[1]+Variance12[2]+Variance12[3];
- pbi->FragmentVariances[CurrentFrag-1] += Var1;
- Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
- Var2 += Variance22[0]+ Variance22[1]+Variance22[2]+Variance22[3];
- pbi->FragmentVariances[CurrentFrag] += Var2;
-
- CurrentFrag ++;
- }//else
-
- }//while
- }
- /****************************************************************************
- *
- * ROUTINE : DeblockNonFilteredBandNewFilter_MMX(
- *
- * INPUTS : None
- *
- * OUTPUTS : None
- *
- * RETURNS : None
- *
- * FUNCTION : Filter both horizontal and vertical edge in a band
- *
- * SPECIAL NOTES : Using Sum of abs to determine where to apply the
- * new 7 tap filter
- *
- * REFERENCE :
- *
- * ERRORS : None.
- *
- ****************************************************************************/
- void DeblockNonFilteredBandNewFilter_MMX(
- POSTPROC_INSTANCE *pbi,
- UINT8 *SrcPtr,
- UINT8 *DesPtr,
- UINT32 PlaneLineStep,
- UINT32 FragAcross,
- UINT32 StartFrag,
- UINT32 *QuantScale
- )
- {
- UINT32 j;
- UINT32 CurrentFrag=StartFrag;
- UINT32 QStep;
- UINT32 LoopFLimit;
- UINT8 *Src, *Des;
- #if defined(_WIN32_WCE)
- #pragma pack(16)
- short QStepMmx[4];
- short FLimitMmx[4];
- short LoopFLimitMmx[4];
- short Rows[80];
- short NewRows[64];
- short LoopFilteredValuesUp[4];
- short LoopFilteredValuesDown[4];
- unsigned char Variance11[8];
- unsigned char Variance21[8];
- UINT32 Var1, Var2;
- #pragma pack()
- #else
- __declspec(align(16)) short QStepMmx[4];
- __declspec(align(16)) short FLimitMmx[4];
- __declspec(align(16)) short LoopFLimitMmx[4];
- __declspec(align(16)) short Rows[80];
- __declspec(align(16)) short NewRows[64];
- __declspec(align(16)) short LoopFilteredValuesUp[4];
- __declspec(align(16)) short LoopFilteredValuesDown[4];
- __declspec(align(16)) unsigned char Variance11[8];
- __declspec(align(16)) unsigned char Variance21[8];
- UINT32 Var1, Var2;
- #endif
- QStep = QuantScale[pbi->FrameQIndex];
- QStepMmx[0] = (INT16)QStep;
- QStepMmx[1] = (INT16)QStep;
- QStepMmx[2] = (INT16)QStep;
- QStepMmx[3] = (INT16)QStep;
- LoopFLimit = DeblockLimitValuesV2[pbi->FrameQIndex];
- LoopFLimitMmx[0] = (INT16)LoopFLimit;
- LoopFLimitMmx[1] = (INT16)LoopFLimit;
- LoopFLimitMmx[2] = (INT16)LoopFLimit;
- LoopFLimitMmx[3] = (INT16)LoopFLimit;
- while(CurrentFrag < StartFrag + FragAcross )
- {
- Src=SrcPtr+8*(CurrentFrag-StartFrag);
- Des=DesPtr+8*(CurrentFrag-StartFrag);
- __asm
- {
-
- push eax
- push ebp
-
- push ecx
- push edx
- push esi
- push edi
- /* Calculate the FLimit and store FLimit and QStep */
- /* Copy the data to the intermediate buffer */
- mov eax, QStep
- xor edx, edx /* clear edx */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- movd mm5, eax
- mov eax, Src /* eax = Src */
- punpcklwd mm5, mm5
- lea esi, NewRows /* esi = NewRows */
- punpckldq mm5, mm5
-
- sub edx, ecx /* edx = - Pitch */
- movq mm6, mm5 /* Q Q Q Q */
- paddw mm6, mm5
- paddw mm6, mm5 /* 3Q3Q3Q3Q */
- packuswb mm5, mm5 /* QQQQQQQQ */
- movq QStepMmx, mm5
- psraw mm6, 2 /* F F F F */
- packuswb mm6, mm6 /* FFFFFFFF */
- lea edi, Rows /* edi = Rows */
- pxor mm7, mm7 /* Clear mm7 */
- psubb mm6, Eight128c /* Eight (F-128)s */
-
- lea eax, [eax + edx * 4 ] /* eax = Src - 4*Pitch */
- movq mm0, [eax + edx] /* mm0 = Src[-5*Pitch] */
- movq mm1, mm0 /* mm1 = mm0 */
- punpcklbw mm0, mm7 /* Lower Four -5 */
- movq mm4, mm1 /* mm4 = Src[-5*Pitch] */
- movq [FLimitMmx], mm6 /* FFFF FFFF */
-
- movq mm2, [eax] /* mm2 = Src[-4*Pitch] */
- punpckhbw mm1, mm7 /* Higher Four -5 */
-
- movq [edi], mm0 /* Write Lower Four of -5 */
- movq mm5, mm2 /* mm5 = S_4 */
-
- movq mm3, mm2 /* mm3 = S_4 */
- movq [edi+8], mm1 /* Write Higher Four of -5 */
- movq mm0, [eax + ecx] /* mm0 = Src[-3*Pitch] */
- psubusb mm5, mm4 /* S_4 - S_5 */
-
- psubusb mm4, mm2 /* S_5 - S_4 */
- punpcklbw mm2, mm7 /* Lower Four -4 */
- por mm4, mm5 /* abs(S_4-S_5) */
- movq [edi+16], mm2 /* Write Lower -4 */
- movq mm6, mm3 /* mm6 = S_4 */
- punpckhbw mm3, mm7 /* higher Four -4 */
- movq [edi+24], mm3 /* write hight -4 */
- movq mm1, mm0 /* mm1 = S_3 */
- punpcklbw mm0, mm7 /* lower four -3 */
- movq [edi+32], mm0 /* write Lower -3 */
- movq mm2, [eax + ecx *2] /* mm2 = Src[-2*Pitch] */
- movq mm5, mm1 /* mm5 = S_3 */
- psubusb mm5, mm6 /* S_3 - S_4 */
- psubusb mm6, mm1 /* S_4 - S_3 */
- por mm5, mm6 /* abs(S_4-S_3) */
- movq mm6, mm1 /* mm6 = S_3 */
-
- punpckhbw mm1, mm7 /* higher four -3 */
- movq mm3, mm2 /* mm3 = S_2 */
-
- movq [edi+40], mm1 /* write Higher -3 */
- paddusb mm4, mm5 /* abs(S_5-S_4)+abs(S_4-S_3) */
-
- movq mm5, mm2 /* mm5 = S_2 */
- psubusb mm5, mm6 /* S_2 - S_3 */
- psubusb mm6, mm2 /* S_3 - S_2 */
- por mm5, mm6 /* abs(S_3 - S_2) */
- movq mm6, mm2 /* mm6 = S_2 */
- punpcklbw mm2, mm7 /* lower four -2 */
- lea eax, [eax + ecx *4] /* eax = Src */
-
- punpckhbw mm3, mm7 /* higher four -2 */
- movq mm0, [eax + edx] /* mm2 = Src[-Pitch] */
- movq [edi+48], mm2 /* lower -2 */
-
- paddusb mm4, mm5 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2) */
- movq mm5, mm0 /* mm5 = S_1 */
- movq [edi+56], mm3 /* higher -2 */
- movq mm1, mm0 /* mm1 = S_1 */
- psubusb mm5, mm6 /* S_1 - S_2 */
- psubusb mm6, mm1 /* S_2 - S_1 */
-
- punpcklbw mm0, mm7 /* lower -1 */
- por mm5, mm6 /* abs(S_2 - S_1) */
- movq [edi+64], mm0 /* Lower -1 */
- movq mm6, mm1 /* mm6 = S_1 */
- punpckhbw mm1, mm7 /* Higher -1 */
- movq [edi+72], mm1 /* Higher -1 */
- movq mm0, [eax] /* mm0 = Src[0] */
- paddusb mm4, mm5 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2 - S_1) */
- movq [Variance11], mm4; /* save the variance */
- movq mm5, FLimitMmx /* mm5 = FFFF FFFF */
- psubb mm4, Eight128c /* abs(..) - 128 */
- pcmpgtb mm5, mm4 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2 - S_1) < FLimit ? */
-
- movq mm1, mm0 /* mm1 = S0 */
- punpcklbw mm0, mm7 /* lower 0 */
-
- movq mm4, mm1 /* mm4 = S0 */
- movq [edi+80], mm0 /* write lower 0 */
- psubusb mm4, mm6 /* S0 - S_1 */
- psubusb mm6, mm1 /* S_1 - S0 */
- movq mm0, [eax + ecx] /* mm0 = Src[Pitch] */
- movq mm3, QStepMmx /* mm3 = QQQQQQQQQ */
- por mm4, mm6 /* abs(S0 - S_1) */
- movq mm6, mm1 /* mm6 = S0 */
-
- psubb mm3, Eight128c /* -128 for using signed compare*/
- psubb mm4, Eight128c /* -128 for using signed compare*/
- pcmpgtb mm3, mm4 /* abs(S0-S_1) < QStep */
- punpckhbw mm1, mm7 /* higher 0 */
-
- movq mm4, mm0 /* mm4 = S1 */
- pand mm5, mm3 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2 - S_1) < FLimit &&
- abs(S0-S_1) < QStep */
- movq [edi+88], mm1 /* write higher 0 */
-
- movq mm1, mm0 /* mm1 = S1 */
- psubusb mm4, mm6 /* S1 - S0 */
- punpcklbw mm0, mm7 /* lower 1 */
- psubusb mm6, mm1 /* S0 - S1 */
- movq [edi+96], mm0 /* write lower 1 */
- por mm4, mm6 /* mm4 = abs(S1-S0) */
- movq mm2, [eax + ecx *2 ] /* mm2 = Src[2*Pitch] */
- movq mm6, mm1 /* mm6 = S1 */
- lea eax, [eax + ecx *4] /* eax = Src + 4 * Pitch */
- punpckhbw mm1, mm7 /* higher 1 */
-
-
- movq mm0, mm2 /* mm0 = S2 */
- movq [edi+104], mm1 /* wirte higher 1 */
- movq mm3, mm0 /* mm3 = S2 */
- movq mm1, [eax + edx ] /* mm4 = Src[3*pitch] */
-
- punpcklbw mm2, mm7 /* lower 2 */
- psubusb mm3, mm6 /* S2 - S1 */
-
- psubusb mm6, mm0 /* S1 - S2 */
- por mm3, mm6 /* abs(S1-S2) */
- movq [edi+112], mm2 /* write lower 2 */
- movq mm6, mm0 /* mm6 = S2 */
- punpckhbw mm0, mm7 /* higher 2 */
- paddusb mm4, mm3 /* abs(S0-S1)+abs(S1-S2) */
-
- movq mm2, mm1 /* mm2 = S3 */
- movq mm3, mm1 /* mm3 = S3 */
-
- movq [edi+120], mm0 /* write higher 2 */
- punpcklbw mm1, mm7 /* Low 3 */
- movq mm0, [eax] /* mm0 = Src[4*pitch] */
- psubusb mm3, mm6 /* S3 - S2 */
- psubusb mm6, mm2 /* S2 - S3 */
- por mm3, mm6 /* abs(S2-S3) */
-
- movq [edi+128], mm1 /* low 3 */
- movq mm6, mm2 /* mm6 = S3 */
-
- punpckhbw mm2, mm7 /* high 3 */
- paddusb mm4, mm3 /* abs(S0-S1)+abs(S1-S2)+abs(S2-S3) */
- movq mm1, mm0 /* mm1 = S4 */
- movq mm3, mm0 /* mm3 = S4 */
-
- movq [edi+136], mm2 /* high 3 */
- punpcklbw mm0, mm7 /* low 4 */
-
- psubusb mm3, mm6 /* S4 - S3 */
- movq [edi+144], mm0 /* low 4 */
-
- psubusb mm6, mm1 /* S3 - S4 */
- por mm3, mm6 /* abs(S3-S4) */
- punpckhbw mm1, mm7 /* high 4 */
- paddusb mm4, mm3 /* abs((S0-S1)+abs(S1-S2)+abs(S2-S3)+abs(S3-S4) */
-
- movq [Variance21], mm4; /* save the variance */
- movq mm6, FLimitMmx /* mm6 = FFFFFFFFF */
- psubb mm4, Eight128c /* abs(..) - 128 */
- movq [edi+152], mm1 /* high 4 */
-
- pcmpgtb mm6, mm4 /* abs((S0-S1)+abs(S1-S2)+abs(S2-S3)+abs(S3-S4)<FLimit? */
- pand mm6, mm5 /* Flag */
- /* done with copying everything to intermediate buffer */
- /* mm7 = 0, mm6 = Flag */
- movq mm0, mm6
- movq mm7, mm6
-
- punpckhbw mm0, mm6
- punpcklbw mm7, mm6
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 and mm7 now are in use */
-
- /* find the loop filtered values for the pixels on block boundary */
- movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
- movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
- movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
- movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
- movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
- psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
- psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
- movq mm4, mm5 /* make a copy */
- paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
- paddw mm3, FourFours /* mm3 + 4 */
- paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
- paddw mm3, mm5 /* Filtval before shift */
- psraw mm3, 3 /* FiltVal */
- movq mm2, mm3 /* make a copy */
- psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor mm2, mm3
- psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
- por mm3, FourOnes /* -1 and 1 for + and - */
- movq mm4, mm1 /* make a copy of Flimit */
- psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
- movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
- psraw mm1, 15 /* FFFF or 0000 */
- pxor mm5, mm1
- psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw mm4, mm3 /* get the sign back */
- movq mm1, [edi+64] /* p[-1] */
- movq mm2, [edi+80] /* p[0] */
-
- paddw mm1, mm4 /* p[-1] + NewFiltVal */
- psubw mm2, mm4 /* p[0] - NewFiltVal */
- pxor mm6, mm6 /* clear mm6 */
-
- packuswb mm1, mm1 /* clamping */
- packuswb mm2, mm2 /* clamping */
- punpcklbw mm1, mm6 /* unpack to word */
- movq LoopFilteredValuesUp, mm1 /* save the values */
- punpcklbw mm2, mm6 /* unpack to word */
- movq LoopFilteredValuesDown, mm2 /* save the values */
- /* Let's do the filtering now */
- /* p1 = Src[-5] */
- /* p2 = Src[+4] */
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
-
- movq mm3, [edi] /* mm3 = [-5] */
- movq mm2, [edi+144] /* mm2 = [4] */
- movq mm1, mm3 /* p1 = [-4] */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- /* Des[-w4] = (((sum + x1) >> 3; */
- /* Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm4, mm3 /* mm4 = mm3 */
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
- psraw mm4, 3 /* mm4 >>=4 */
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm7 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)>>3 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- psraw mm4, 3 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])>>3 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- psraw mm4, 3 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
- psraw mm4, 3 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x4 */
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)>>3 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x4 */
- psraw mm4, 3 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x5 */
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)>>3 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- psraw mm4, 3 /* >>=3 */
- psubw mm4, mm5 /* -=x6 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x6 */
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = (sum+x7)>>3 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- psraw mm4, 3 /* >>=3 */
- psubw mm4, mm5 /* -=x7 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x7 */
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)>>3 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- psraw mm4, 3 /* >>=3 */
-
- psubw mm4, mm5 /* -=x8 */
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x8 */
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with left four columns */
- /* now do the righ four columns */
-
- add edi, 8 /* shift to right four column */
- add esi, 8 /* shift to right four column */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 now are in use */
-
- /* find the loop filtered values for the pixels on block boundary */
- movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
- movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
- movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
- movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
- movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
- psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
- psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
- movq mm4, mm5 /* make a copy */
- paddw mm3, FourFours /* mm3 + 4 */
- paddw mm4, mm4 /* 2 * ( p[0] - p[-1] ) */
- paddw mm3, mm4 /* 3 * ( p[0] - p[-1] ) */
- paddw mm3, mm5 /* Filtval before shift */
- psraw mm3, 3 /* FiltVal */
- movq mm2, mm3 /* make a copy */
- psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor mm2, mm3
- psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
- por mm3, FourOnes /* -1 and 1 for + and - */
- movq mm4, mm1 /* make a copy of Flimit */
- psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
- movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
- psraw mm1, 15 /* FFFF or 0000 */
- pxor mm5, mm1
- psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw mm4, mm3 /* get the sign back */
- movq mm1, [edi+64] /* p[-1] */
- movq mm2, [edi+80] /* p[0] */
-
- paddw mm1, mm4 /* p[-1] + NewFiltVal */
- psubw mm2, mm4 /* p[0] - NewFiltVal */
- pxor mm6, mm6 /* clear mm6 */
-
- packuswb mm1, mm1 /* clamping */
- packuswb mm2, mm2 /* clamping */
- punpcklbw mm1, mm6 /* unpack to word */
- movq LoopFilteredValuesUp, mm1 /* save the values */
- punpcklbw mm2, mm6 /* unpack to word */
- movq LoopFilteredValuesDown, mm2 /* save the values */
-
-
- /* Let's do the filtering now */
- /* p1 = Src[-5] */
- /* p2 = Src[+4] */
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
-
- movq mm3, [edi] /* mm3 = [-5] */
- movq mm2, [edi+144] /* mm2 = [4] */
-
- movq mm1, mm3 /* p1 = [-4] */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
- movq mm4, [edi+16] /* mm4 = x1 */
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
- /* Des[-w4] = (((sum + x1) >> 3; */
- /* Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
- movq mm4, mm3 /* mm4 = mm3 */
- movq mm5, [edi+16] /* mm5 = x1 */
- paddw mm4, mm5 /* mm4 = sum+x1 */
- psraw mm4, 3 /* mm4 >>=4 */
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm0 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)>>3 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- psraw mm4, 3 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])>>3 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- psraw mm4, 3 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
- psubw mm4, mm5 /* new value - old value */
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
- psraw mm4, 3 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x4 */
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)>>3 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x4 */
- psraw mm4, 3 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x5 */
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)>>3 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- psraw mm4, 3 /* >>=3 */
- psubw mm4, mm5 /* -=x6 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x6 */
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = (sum+x7)>>3 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- psraw mm4, 3 /* >>=3 */
- psubw mm4, mm5 /* -=x7 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x7 */
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)>>3 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- psraw mm4, 3 /* >>=3 */
-
- psubw mm4, mm5 /* -=x8 */
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x8 */
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with right four column */
- add edi, 8 /* shift edi to point x1 */
- sub esi, 8 /* shift esi back to x1 */
- mov ebp, Des /* the destination */
- lea ebp, [ebp + edx *4] /* point to des[-w4] */
-
- movq mm0, [esi]
- packuswb mm0, [esi + 8]
-
- movq [ebp], mm0 /* write des[-w4] */
-
- movq mm1, [esi + 16]
- packuswb mm1, [esi + 24]
-
- movq [ebp+ecx ], mm1 /* write des[-w3] */
-
- movq mm2, [esi + 32]
- packuswb mm2, [esi + 40]
-
- movq [ebp+ecx*2 ], mm2 /* write des[-w2] */
-
- movq mm3, [esi + 48]
- packuswb mm3, [esi + 56]
-
- lea ebp, [ebp+ecx*4] /* point to des[0] */
- movq [ebp+edx], mm3 /* write des[-w1] */
-
- movq mm0, [esi + 64]
- packuswb mm0, [esi + 72]
-
- movq [ebp ], mm0 /* write des[0] */
-
- movq mm1, [esi + 80]
- packuswb mm1, [esi + 88]
-
- movq [ebp+ecx], mm1 /* write des[w1] */
-
- movq mm2, [esi + 96]
- packuswb mm2, [esi + 104]
-
- movq [ebp+ecx*2], mm2 /* write des[w2] */
-
- movq mm3, [esi + 112]
- packuswb mm3, [esi + 120]
-
- lea ebp, [ebp+ecx*2] /* point to des[w4] */
- movq [ebp+ecx], mm3 /* write des[w3] */
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebp
- pop eax
-
- } /* end of the macro */
-
- Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
- Var1 += Variance11[4]+ Variance11[5]+Variance11[6]+Variance11[7];
- pbi->FragmentVariances[CurrentFrag] += Var1;
- Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
- Var2 += Variance21[4]+ Variance21[5]+Variance21[6]+Variance21[7];
- pbi->FragmentVariances[CurrentFrag + FragAcross] += Var2;
- if(CurrentFrag==StartFrag)
- CurrentFrag++;
- else
- {
- Des=DesPtr-8*PlaneLineStep+8*(CurrentFrag-StartFrag);
- Src=Des;
- for( j=0; j<8;j++)
- {
- Rows[j] = (short) (Src[-5+j*PlaneLineStep]);
- Rows[72+j] = (short)(Src[4+j*PlaneLineStep]);
- }
- __asm
- {
- /* Save the registers */
- push eax
- push ebp
- /* Calculate the FLimit and store FLimit and QStep */
- mov eax, QStep /* get QStep */
- movd mm0, eax /* mm0 = 0, 0, 0, Q */
- push ecx
-
- punpcklwd mm0, mm0 /* mm0 = 0, 0, Q, Q */
- punpckldq mm0, mm0 /* mm0 = Q, Q, Q, Q */
- push edx
-
- movq mm1, mm0 /* mm1 = Q, Q, Q, Q */
- paddw mm1, mm0
-
- push esi
- paddw mm1, mm0
- packuswb mm0, mm0
-
- push edi
-
- movq QStepMmx, mm0 /* write the Q step */
- psraw mm1, 2 /* mm1 = FLimit */
-
- packuswb mm1, mm1 /* mm1 = FFFF FFFF */
- psubb mm1, Eight128c /* F-128 */
- movq [FLimitMmx], mm1 /* Save FLimit */
- /* setup the pointers to data */
- mov eax, Src /* eax = Src */
- xor edx, edx /* clear edx */
-
- sub eax, 4 /* eax = Src-4 */
- lea esi, NewRows /* esi = NewRows */
- lea edi, Rows /* edi = Rows */
- mov ecx, PlaneLineStep /* ecx = Pitch */
- sub edx, ecx /* edx = -Pitch */
- /* Get the data to the intermediate buffer */
- movq mm0, [eax] /* mm0 = 07 06 05 04 03 02 01 00 */
- movq mm1, [eax+ecx] /* mm1 = 17 16 15 14 13 12 11 10 */
- movq mm2, [eax+ecx*2] /* mm2 = 27 26 25 24 23 22 21 20 */
- lea eax, [eax+ecx*4] /* Go down four Rows */
- movq mm3, [eax+edx] /* mm3 = 37 36 35 34 33 32 31 30 */
- movq mm4, mm0 /* mm4 = 07 06 05 04 03 02 01 00 */
-
- punpcklbw mm0, mm1 /* mm0 = 13 03 12 02 11 01 10 00 */
- punpckhbw mm4, mm1 /* mm4 = 17 07 16 06 15 05 14 04 */
- movq mm5, mm2 /* mm5 = 27 26 25 24 23 22 21 20 */
- punpcklbw mm2, mm3 /* mm2 = 33 23 32 22 31 21 30 20 */
- punpckhbw mm5, mm3 /* mm5 = 37 27 36 26 35 25 34 24 */
- movq mm1, mm0 /* mm1 = 13 03 12 02 11 01 10 00 */
- punpcklwd mm0, mm2 /* mm0 = 31 21 11 01 30 20 10 00 */
- punpckhwd mm1, mm2 /* mm1 = 33 23 13 03 32 22 12 02 */
-
- movq mm2, mm4 /* mm2 = 17 07 16 06 15 05 14 04 */
- punpckhwd mm4, mm5 /* mm4 = 37 27 17 07 36 26 16 06 */
- punpcklwd mm2, mm5 /* mm2 = 35 25 15 05 34 24 14 04 */
- pxor mm7, mm7 /* clear mm7 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 30 20 10 00 */
- movq [edi+16], mm0 /* write 00 10 20 30 */
- punpckhbw mm5, mm7 /* mm5 = 31 21 11 01 */
- movq mm0, mm1 /* mm0 =33 23 13 03 32 22 12 02 */
- movq [edi+32], mm5 /* write 01 11 21 31 */
-
- punpcklbw mm1, mm7 /* mm1 = 32 22 12 02 */
- punpckhbw mm0, mm7 /* mm0 = 33 23 12 03 */
- movq [edi+48], mm1 /* write 02 12 22 32 */
- movq mm3, mm2 /* mm3 = 35 25 15 05 34 24 14 04 */
-
- movq mm5, mm4 /* mm5 = 37 27 17 07 36 26 16 06 */
- movq [edi+64], mm0 /* write 03 13 23 33 */
- punpcklbw mm2, mm7 /* mm2 = 34 24 14 04 */
- punpckhbw mm3, mm7 /* mm3 = 35 25 15 05 */
- movq [edi+80], mm2 /* write 04 14 24 34 */
- punpcklbw mm4, mm7 /* mm4 = 36 26 16 06 */
- punpckhbw mm5, mm7 /* mm5 = 37 27 17 07 */
- movq [edi+96], mm3 /* write 05 15 25 35 */
-
- movq mm0, [eax] /* mm0 = 47 46 45 44 43 42 41 40 */
- movq mm1, [eax + ecx ] /* mm1 = 57 56 55 54 53 52 51 50 */
- movq [edi+112], mm4 /* write 06 16 26 37 */
- movq mm2, [eax+ecx*2] /* mm2 = 67 66 65 64 63 62 61 60 */
- lea eax, [eax+ ecx*4] /* Go down four rows */
- movq [edi+128], mm5 /* write 07 17 27 37 */
- movq mm4, mm0 /* mm4 = 47 46 45 44 43 42 41 40 */
- movq mm3, [eax+edx] /* mm3 = 77 76 75 74 73 72 71 70 */
- punpcklbw mm0, mm1 /* mm0 = 53 43 52 42 51 41 50 40 */
- punpckhbw mm4, mm1 /* mm4 = 57 57 56 46 55 45 54 44 */
- movq mm5, mm2 /* mm5 = 67 66 65 64 63 62 61 60 */
- punpcklbw mm2, mm3 /* mm2 = 73 63 72 62 71 61 70 60 */
- punpckhbw mm5, mm3 /* mm5 = 77 67 76 66 75 65 74 64 */
- movq mm1, mm0 /* mm1 = 53 43 52 42 51 41 50 40 */
- punpcklwd mm0, mm2 /* mm0 = 71 61 51 41 70 60 50 40 */
- punpckhwd mm1, mm2 /* mm1 = 73 63 53 43 72 62 52 42 */
-
- movq mm2, mm4 /* mm2 = 57 57 56 46 55 45 54 44 */
- punpckhwd mm4, mm5 /* mm4 = 77 67 57 47 76 66 56 46 */
- punpcklwd mm2, mm5 /* mm2 = 75 65 55 45 74 64 54 44 */
- movq mm5, mm0 /* make a copy */
- punpcklbw mm0, mm7 /* mm0 = 70 60 50 40 */
- movq [edi+24], mm0 /* write 40 50 60 70 */
- punpckhbw mm5, mm7 /* mm5 = 71 61 51 41 */
- movq mm0, mm1 /* mm0 = 73 63 53 43 72 62 52 42 */
- movq [edi+40], mm5 /* write 41 51 61 71 */
-
- punpcklbw mm1, mm7 /* mm1 = 72 62 52 42 */
- punpckhbw mm0, mm7 /* mm0 = 73 63 53 43 */
- movq [edi+56], mm1 /* write 42 52 62 72 */
- movq mm3, mm2 /* mm3 = 75 65 55 45 74 64 54 44 */
-
- movq mm5, mm4 /* mm5 = 77 67 57 47 76 66 56 46 */
- movq [edi+72], mm0 /* write 43 53 63 73 */
- punpcklbw mm2, mm7 /* mm2 = 74 64 54 44 */
- punpckhbw mm3, mm7 /* mm3 = 75 65 55 45 */
- movq [edi+88], mm2 /* write 44 54 64 74 */
- punpcklbw mm4, mm7 /* mm4 = 76 66 56 46 */
- punpckhbw mm5, mm7 /* mm5 = 77 67 57 47 */
- movq [edi+104], mm3 /* write 45 55 65 75 */
-
- movq [edi+120], mm4 /* write 46 56 66 76 */
- movq [edi+136], mm5 /* write 47 57 67 77 */
- /* Now, compute the variances for Pixel 1-4 and 5-8 */
-
- movq mm0, [edi] /* S_5 */
- movq mm1, [edi+16] /* S_4 */
- movq mm2, [edi+32] /* S_3 */
- packuswb mm0, [edi+8]
- packuswb mm1, [edi+24]
- packuswb mm2, [edi+40]
- movq mm3, [edi+48] /* S_2 */
- movq mm4, [edi+64] /* S_1 */
- packuswb mm3, [edi+56]
- packuswb mm4, [edi+72]
- movq mm5, mm1 /* S_4 */
- movq mm6, mm2 /* S_3 */
- psubusb mm5, mm0 /* S_4 - S_5 */
- psubusb mm0, mm1 /* S_5 - S_4 */
- por mm0, mm5 /* abs(S_5-S_4) */
- psubusb mm6, mm1 /* S_3 - S_4 */
- psubusb mm1, mm2 /* S_4 - S_3 */
- movq mm5, mm3 /* S_2 */
- por mm1, mm6 /* abs(S_4-S_3) */
- psubusb mm5, mm2 /* S_2 - S_3 */
-
- psubusb mm2, mm3 /* S_3 - S_2 */
- movq mm6, mm4 /* S_1 */
- por mm2, mm5 /* abs(S_3-S_2) */
- psubusb mm6, mm3 /* S_1 - S_2 */
- psubusb mm3, mm4 /* S_2 - S_1 */
- por mm3, mm6 /* abs(S_2-S_1) */
- paddusb mm0, mm1 /* abs(S_5-S_4)+abs(S_4-S_3) */
- paddusb mm2, mm3 /* abs(S_3-S_2)+abs(S_2-S_1) */
- movq mm7, FLimitMmx /* FFFFF FFFF */
- paddusb mm0, mm2 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2-S_1) */
-
- movq [Variance11], mm0 /* Save the variance */
- movq mm6, mm4 /* S_1 */
- psubb mm0, Eight128c /* abs(..) - 128 */
- pcmpgtb mm7, mm0 /* abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2-S_1)<? */
-
- movq mm5, [edi+80] /* S0 */
- movq mm1, [edi+96] /* S1 */
- movq mm2, [edi+112] /* S2 */
- packuswb mm5, [edi+88]
- packuswb mm1, [edi+104]
- packuswb mm2, [edi+120]
- movq mm3, [edi+128] /* S3 */
- movq mm4, [edi+144] /* S4 */
- packuswb mm3, [edi+136]
- packuswb mm4, [edi+152]
- movq mm0, mm5 /* S0 */
- psubusb mm5, mm6 /* S0-S_1 */
- psubusb mm6, mm0 /* S_1-S0 */
- por mm5, mm6 /* abs(S_1-S0) */
- movq mm6, QStepMmx /* QQQQ QQQQ */
- psubb mm5, Eight128c /* -128 for using signed compare*/
- psubb mm6, Eight128c /* -128 for using signed compare*/
- pcmpgtb mm6, mm5 /* abs(S_1-S0)<QStep? */
- movq mm5, mm1 /* S1 */
- pand mm7, mm6 /* abs(S_1-S0)<QStep &&
- abs(S_5-S_4)+abs(S_4-S_3)+abs(S_3-S_2)+abs(S_2-S_1)<FLimit? */
- movq mm6, mm2 /* S2 */
- psubusb mm5, mm0 /* S1 - S0 */
- psubusb mm0, mm1 /* S0 - S1*/
- por mm0, mm5 /* abs(S0-S1) */
- psubusb mm6, mm1 /* S2 - S1 */
- psubusb mm1, mm2 /* S1 - S2*/
- movq mm5, mm3 /* S3 */
- por mm1, mm6 /* abs(S1-S2) */
- psubusb mm5, mm2 /* S3 - S2 */
-
- psubusb mm2, mm3 /* S2 - S3 */
- movq mm6, mm4 /* S4 */
- por mm2, mm5 /* abs(S2-S3) */
- psubusb mm6, mm3 /* S4 - S3 */
- psubusb mm3, mm4 /* S3 - S4 */
- por mm3, mm6 /* abs(S3-S4) */
- paddusb mm0, mm1 /* abs(S0-S1)+abs(S1-S2) */
- paddusb mm2, mm3 /* abs(S2-S3)+abs(S3-S4) */
- movq mm6, FLimitMmx /* FFFFF FFFF */
- paddusb mm0, mm2 /* abs(S0-S1)+abs(S1-S2)+abs(S2-S3)+abs(S3-S4) */
-
- movq [Variance21], mm0 /* Save the variance */
-
- psubb mm0, Eight128c /* abs(..) - 128 */
- pcmpgtb mm6, mm0 /* abs(S0-S1)+abs(S1-S2)+abs(S2-S3)+abs(S3-S4)<FLimit */
- pand mm6, mm7 /* Flag */
- movq mm0, mm6
- movq mm7, mm6
-
- punpckhbw mm0, mm6
- punpcklbw mm7, mm6
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 and mm7 now are in use */
- /* find the loop filtered values for the pixels on block boundary */
- movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
- movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
- movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
- movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
- movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
- psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
- psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
- movq mm4, mm5 /* make a copy */
- paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
- paddw mm3, FourFours /* mm3 + 4 */
- paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
- paddw mm3, mm5 /* Filtval before shift */
- psraw mm3, 3 /* FiltVal */
- movq mm2, mm3 /* make a copy */
- psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor mm2, mm3
- psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
- por mm3, FourOnes /* -1 and 1 for + and - */
- movq mm4, mm1 /* make a copy of Flimit */
- psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
- movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
- psraw mm1, 15 /* FFFF or 0000 */
- pxor mm5, mm1
- psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw mm4, mm3 /* get the sign back */
- movq mm1, [edi+64] /* p[-1] */
- movq mm2, [edi+80] /* p[0] */
-
- paddw mm1, mm4 /* p[-1] + NewFiltVal */
- psubw mm2, mm4 /* p[0] - NewFiltVal */
- pxor mm6, mm6 /* clear mm6 */
-
- packuswb mm1, mm1 /* clamping */
- packuswb mm2, mm2 /* clamping */
- punpcklbw mm1, mm6 /* unpack to word */
- movq LoopFilteredValuesUp, mm1 /* save the values */
- punpcklbw mm2, mm6 /* unpack to word */
- movq LoopFilteredValuesDown, mm2 /* save the values */
- /* Let's do the filtering now */
- /* p1 = Src[-5] */
- /* p2 = Src[+4] */
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
-
- movq mm3, [edi] /* mm3 = [-5] */
- movq mm2, [edi+144] /* mm2 = [4] */
-
- movq mm1, mm3 /* p1 = [-4] */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- movq mm4, [edi+16] /* mm4 = x1 */
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
-
- /* Des[-w4] = (((sum + x1) >> 3; */
- /* Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm4, mm3 /* mm4 = mm3 */
- movq mm5, [edi+16] /* mm5 = x1 */
-
- paddw mm4, mm5 /* mm4 = sum+x1 */
- psraw mm4, 3 /* mm4 >>=3 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm7 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)>>3 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- psraw mm4, 3 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm7 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])>>3 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- psraw mm4, 3 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
-
- psubw mm4, mm5 /* new value - old value */
- pand mm4, mm7 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
-
- psraw mm4, 3 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x4 */
-
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)>>3 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x4 */
-
- psraw mm4, 3 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
-
- pand mm4, mm7 /* and flag */
- paddw mm4, mm5 /* += x5 */
-
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)>>3 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- psraw mm4, 3 /* >>=3 */
-
- psubw mm4, mm5 /* -=x6 */
- pand mm4, mm7 /* and flag */
-
- paddw mm4, mm5 /* += x6 */
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = (sum+x7)>>3 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- psraw mm4, 3 /* >>=3 */
-
- psubw mm4, mm5 /* -=x7 */
- pand mm4, mm7 /* and flag */
-
- paddw mm4, mm5 /* += x7 */
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)>>3 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- psraw mm4, 3 /* >>=3 */
-
- psubw mm4, mm5 /* -=x8 */
- pand mm4, mm7 /* and flag */
-
- paddw mm4, mm5 /* += x8 */
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with left four columns */
- /* now do the righ four columns */
- add edi, 8 /* shift to right four column */
- add esi, 8 /* shift to right four column */
-
- /* mm0 = Variance 1< Flimit && Variance 2<Flimit && abs(4-5)<QStep */
- /* mm0 now are in use */
- /* find the loop filtered values for the pixels on block boundary */
- movq mm1, LoopFLimitMmx; /* Get the Flimit values for loop filter */
- movq mm3, [edi + 48] /* mm3 = x3 = p[-2] */
- movq mm4, [edi + 64] /* mm4 = x4 = p[-1] */
- movq mm5, [edi + 80] /* mm5 = x5 = p[ 0] */
- movq mm6, [edi + 96] /* mm6 = x6 = p[ 1] */
- psubw mm5, mm4 /* mm5 = p[ 0] - p[-1] */
- psubw mm3, mm6 /* mm3 = p[-2] - p[ 1] */
- movq mm4, mm5 /* make a copy */
- paddw mm4, mm5 /* 2 * ( p[0] - p[-1] ) */
- paddw mm3, FourFours /* mm3 + 4 */
- paddw mm5, mm4 /* 3 * ( p[0] - p[-1] ) */
- paddw mm3, mm5 /* Filtval before shift */
- psraw mm3, 3 /* FiltVal */
- movq mm2, mm3 /* make a copy */
- psraw mm3, 15 /* FFFF->Neg, 0000->Pos */
- pxor mm2, mm3
- psubsw mm2, mm3 /* mm2 = abs(FiltVal) */
- por mm3, FourOnes /* -1 and 1 for + and - */
- movq mm4, mm1 /* make a copy of Flimit */
- psubw mm1, mm2 /* mm1= Flimit - abs(FiltVal) */
- movq mm5, mm1 /* copy Flimit - abs(FiltVal) */
- psraw mm1, 15 /* FFFF or 0000 */
- pxor mm5, mm1
- psubsw mm5, mm1 /* abs(Flimit - abs(FiltVal)) */
- psubusw mm4, mm5 /* Flimit-abs(Flimit - abs(FiltVal)) */
- pmullw mm4, mm3 /* get the sign back */
- movq mm1, [edi+64] /* p[-1] */
- movq mm2, [edi+80] /* p[0] */
-
- paddw mm1, mm4 /* p[-1] + NewFiltVal */
- psubw mm2, mm4 /* p[0] - NewFiltVal */
- pxor mm6, mm6 /* clear mm6 */
-
- packuswb mm1, mm1 /* clamping */
- packuswb mm2, mm2 /* clamping */
- punpcklbw mm1, mm6 /* unpack to word */
- movq LoopFilteredValuesUp, mm1 /* save the values */
- punpcklbw mm2, mm6 /* unpack to word */
- movq LoopFilteredValuesDown, mm2 /* save the values */
-
-
- /* Let's do the filtering now */
- /* p1 = Src[-5] */
- /* p2 = Src[+4] */
- /* sum = p1 + p1 + p1 + x1 + x2 + x3 + x4 + 4; */
-
- movq mm3, [edi] /* mm3 = [-5] */
- movq mm2, [edi+144] /* mm2 = [4] */
-
- movq mm1, mm3 /* p1 = [-4] */
- paddw mm3, mm3 /* mm3 = p1 + p1 */
-
- movq mm4, [edi+16] /* mm4 = x1 */
- paddw mm3, mm1 /* mm3 = p1 + p1 + p1 */
-
- paddw mm3, [edi+32] /* mm3 = p1+p1+p1+ x2 */
- paddw mm4, [edi+48] /* mm4 = x1+x3 */
-
- paddw mm3, [edi+64] /* mm3 += x4 */
- paddw mm4, FourFours /* mm4 = x1 + x3 + 4 */
-
- paddw mm3, mm4 /* mm3 = 3*p1+x1+x2+x3+x4+4 */
-
- /* Des[-w4] = (((sum + x1) >> 3; */
- /* Des[-w4] = Src[-w4]; */
- /* which is equivalent to Src[-w4] + flag * ( newvalue - Src[-w4] */
-
- movq mm4, mm3 /* mm4 = mm3 */
- movq mm5, [edi+16] /* mm5 = x1 */
-
- paddw mm4, mm5 /* mm4 = sum+x1 */
- psraw mm4, 3 /* mm4 >>=4 */
-
- psubw mm4, mm5 /* New Value - old Value */
- pand mm4, mm0 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi], mm4 /* Write new x1 */
-
- /* sum += x5 -p1 */
- /* Des[-w3]=((sum+x2)>>3 */
-
- movq mm5, [edi+32] /* mm5= x2 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+80] /* sum=sum+x5 */
- movq mm4, mm5 /* copy sum */
-
- paddw mm4, mm3 /* mm4=sum+x2 */
- psraw mm4, 3 /* mm4=((sum+x2)<<1-x5+x6)>>4 */
- psubw mm4, mm5 /* new value - old value */
-
- pand mm4, mm0 /* And the flag */
- paddw mm4, mm5 /* add the old value back */
-
- movq [esi+16], mm4 /* write new x2 */
-
- /* sum += x6 - p1 */
- /* Des[-w2]=((sum+x[3])>>3 */
-
- movq mm5, [edi+48] /* mm5= x3 */
- psubw mm3, mm1 /* sum=sum-p1 */
-
- paddw mm3, [edi+96] /* sum=sum+x6 */
- movq mm4, mm5 /* copy x3 */
-
- paddw mm4, mm3 /* mm4=sum+x3 */
- psraw mm4, 3 /* mm4=((sum+x3)<<1-x6+x7)>>4 */
-
- psubw mm4, mm5 /* new value - old value */
- pand mm4, mm0 /* And the flag */
-
- paddw mm4, mm5 /* add the old value back */
- movq [esi+32], mm4 /* write new x3 */
-
- /* sum += x7 - p1 */
- /* Des[-w1]=((sum+x4)<<1+p1-x1-x7+x8]>>4 */
-
- movq mm5, [edi+64] /* mm5 = x4 */
- psubw mm3, mm1 /* sum = sum-p1 */
-
- paddw mm3, [edi+112] /* sum = sum+x7 */
- movq mm4, mm5 /* mm4 = x4 */
-
- paddw mm4, mm3 /* mm4 = sum + x4 */
- movq mm5, LoopFilteredValuesUp/* Read the loopfiltered value of x4 */
-
- psraw mm4, 3 /* >>=4 */
- psubw mm4, mm5 /* -=x4 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x4 */
-
- movq [esi+48], mm4 /* write new x4 */
-
- /* sum+= x8-x1 */
- /* Des[0]=((sum+x5)>>3 */
-
- movq mm5, [edi+80] /* mm5 = x5 */
- psubw mm3, [edi+16] /* sum -= x1 */
-
- paddw mm3, [edi+128] /* sub += x8 */
- movq mm4, mm5 /* mm4 = x5 */
-
- paddw mm4, mm3 /* mm4= sum+x5 */
- movq mm5, LoopFilteredValuesDown/* Read the loopfiltered value of x4 */
-
- psraw mm4, 3 /* >>=4 */
- psubw mm4, mm5 /* -=x5 */
-
- pand mm4, mm0 /* and flag */
- paddw mm4, mm5 /* += x5 */
-
- movq [esi+64], mm4 /* write new x5 */
-
- /* sum += p2 - x2 */
- /* Des[w1] = ((sum+x6)>>3 */
-
- movq mm5, [edi+96] /* mm5 = x6 */
- psubw mm3, [edi+32] /* -= x2 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x6 */
-
- paddw mm4, mm3 /* mm4 = sum+x6 */
- psraw mm4, 3 /* >>=3 */
-
- psubw mm4, mm5 /* -=x6 */
- pand mm4, mm0 /* and flag */
-
- paddw mm4, mm5 /* += x6 */
- movq [esi+80], mm4 /* write new x6 */
-
- /* sum += p2 - x3 */
- /* Des[w2] = (sum+x7)>>3 */
-
- movq mm5, [edi+112] /* mm5 = x7 */
- psubw mm3, [edi+48] /* -= x3 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x7 */
-
- paddw mm4, mm3 /* mm4 = sum+x7 */
- psraw mm4, 3 /* >>=3 */
-
- psubw mm4, mm5 /* -=x7 */
- pand mm4, mm0 /* and flag */
-
- paddw mm4, mm5 /* += x7 */
- movq [esi+96], mm4 /* write new x7 */
-
- /* sum += p2 - x4 */
- /* Des[w3] = ((sum+x8)>>3 */
-
- movq mm5, [edi+128] /* mm5 = x8 */
- psubw mm3, [edi+64] /* -= x4 */
-
- paddw mm3, mm2 /* += p2 */
- movq mm4, mm5 /* mm4 = x8 */
-
- paddw mm4, mm3 /* mm4 = sum+x8 */
- psraw mm4, 3 /* >>=3 */
-
- psubw mm4, mm5 /* -=x8 */
- pand mm4, mm0 /* and flag */
-
- paddw mm4, mm5 /* += x8 */
- movq [esi+112], mm4 /* write new x8 */
-
- /* done with right four column */
- /* transpose */
- mov eax, Des /* the destination */
- add edi, 8 /* shift edi to point x1 */
- sub esi, 8 /* shift esi back to left x1 */
- sub eax, 4
- movq mm0, [esi] /* mm0 = 30 20 10 00 */
- movq mm1, [esi+16] /* mm1 = 31 21 11 01 */
- movq mm4, mm0 /* mm4 = 30 20 10 00 */
- punpcklwd mm0, mm1 /* mm0 = 11 10 01 00 */
- punpckhwd mm4, mm1 /* mm4 = 31 30 21 20 */
- movq mm2, [esi+32] /* mm2 = 32 22 12 02 */
- movq mm3, [esi+48] /* mm3 = 33 23 13 03 */
- movq mm5, mm2 /* mm5 = 32 22 12 02 */
- punpcklwd mm2, mm3 /* mm2 = 13 12 03 02 */
- punpckhwd mm5, mm3 /* mm5 = 33 32 23 22 */
- movq mm1, mm0 /* mm1 = 11 10 01 00 */
- punpckldq mm0, mm2 /* mm0 = 03 02 01 00 */
- movq [edi], mm0 /* write 00 01 02 03 */
- punpckhdq mm1, mm2 /* mm1 = 13 12 11 10 */
-
- movq mm0, mm4 /* mm0 = 31 30 21 20 */
- movq [edi+16], mm1 /* write 10 11 12 13 */
- punpckldq mm0, mm5 /* mm0 = 23 22 21 20 */
- punpckhdq mm4, mm5 /* mm4 = 33 32 31 30 */
- movq mm1, [esi+64] /* mm1 = 34 24 14 04 */
- movq mm2, [esi+80] /* mm2 = 35 25 15 05 */
- movq mm5, [esi+96] /* mm5 = 36 26 16 06 */
- movq mm6, [esi+112] /* mm6 = 37 27 17 07 */
-
- movq mm3, mm1 /* mm3 = 34 24 14 04 */
- movq mm7, mm5 /* mm7 = 36 26 16 06 */
- punpcklwd mm1, mm2 /* mm1 = 15 14 05 04 */
- punpckhwd mm3, mm2 /* mm3 = 35 34 25 24 */
- punpcklwd mm5, mm6 /* mm5 = 17 16 07 06 */
- punpckhwd mm7, mm6 /* mm7 = 37 36 27 26 */
- movq mm2, mm1 /* mm2 = 15 14 05 04 */
- movq mm6, mm3 /* mm6 = 35 34 25 24 */
- punpckldq mm1, mm5 /* mm1 = 07 06 05 04 */
- punpckhdq mm2, mm5 /* mm2 = 17 16 15 14 */
- punpckldq mm3, mm7 /* mm3 = 27 26 25 24 */
- punpckhdq mm6, mm7 /* mm6 = 37 36 35 34 */
-
- movq mm5, [edi] /* mm5 = 03 02 01 00 */
- packuswb mm5, mm1 /* mm5 = 07 06 05 04 03 02 01 00 */
-
- movq [eax], mm5 /* write 00 01 02 03 04 05 06 07 */
- movq mm7, [edi+16] /* mm7 = 13 12 11 10 */
- packuswb mm7, mm2 /* mm7 = 17 16 15 14 13 12 11 10 */
- movq [eax+ecx], mm7 /* write 10 11 12 13 14 15 16 17 */
- packuswb mm0, mm3 /* mm0 = 27 26 25 24 23 22 21 20 */
- packuswb mm4, mm6 /* mm4 = 37 36 35 34 33 32 31 30 */
-
- movq [eax+ecx*2], mm0 /* write 20 21 22 23 24 25 26 27 */
- lea eax, [eax+ecx*4] /* mov forward the desPtr */
- movq [eax+edx], mm4 /* write 30 31 32 33 34 35 36 37 */
- add edi, 8 /* move to right four column */
- add esi, 8 /* move to right x1 */
- movq mm0, [esi] /* mm0 = 70 60 50 40 */
- movq mm1, [esi+16] /* mm1 = 71 61 51 41 */
- movq mm4, mm0 /* mm4 = 70 60 50 40 */
- punpcklwd mm0, mm1 /* mm0 = 51 50 41 40 */
- punpckhwd mm4, mm1 /* mm4 = 71 70 61 60 */
- movq mm2, [esi+32] /* mm2 = 72 62 52 42 */
- movq mm3, [esi+48] /* mm3 = 73 63 53 43 */
- movq mm5, mm2 /* mm5 = 72 62 52 42 */
- punpcklwd mm2, mm3 /* mm2 = 53 52 43 42 */
- punpckhwd mm5, mm3 /* mm5 = 73 72 63 62 */
- movq mm1, mm0 /* mm1 = 51 50 41 40 */
- punpckldq mm0, mm2 /* mm0 = 43 42 41 40 */
- movq [edi], mm0 /* write 40 41 42 43 */
- punpckhdq mm1, mm2 /* mm1 = 53 52 51 50 */
-
- movq mm0, mm4 /* mm0 = 71 70 61 60 */
- movq [edi+16], mm1 /* write 50 51 52 53 */
- punpckldq mm0, mm5 /* mm0 = 63 62 61 60 */
- punpckhdq mm4, mm5 /* mm4 = 73 72 71 70 */
- movq mm1, [esi+64] /* mm1 = 74 64 54 44 */
- movq mm2, [esi+80] /* mm2 = 75 65 55 45 */
- movq mm5, [esi+96] /* mm5 = 76 66 56 46 */
- movq mm6, [esi+112] /* mm6 = 77 67 57 47 */
-
- movq mm3, mm1 /* mm3 = 74 64 54 44 */
- movq mm7, mm5 /* mm7 = 76 66 56 46 */
- punpcklwd mm1, mm2 /* mm1 = 55 54 45 44 */
- punpckhwd mm3, mm2 /* mm3 = 75 74 65 64 */
- punpcklwd mm5, mm6 /* mm5 = 57 56 47 46 */
- punpckhwd mm7, mm6 /* mm7 = 77 76 67 66 */
- movq mm2, mm1 /* mm2 = 55 54 45 44 */
- movq mm6, mm3 /* mm6 = 75 74 65 64 */
- punpckldq mm1, mm5 /* mm1 = 47 46 45 44 */
- punpckhdq mm2, mm5 /* mm2 = 57 56 55 54 */
- punpckldq mm3, mm7 /* mm3 = 67 66 65 64 */
- punpckhdq mm6, mm7 /* mm6 = 77 76 75 74 */
-
- movq mm5, [edi] /* mm5 = 43 42 41 40 */
- packuswb mm5, mm1 /* mm5 = 47 46 45 44 43 42 41 40 */
-
- movq [eax], mm5 /* write 40 41 42 43 44 45 46 47 */
- movq mm7, [edi+16] /* mm7 = 53 52 51 50 */
- packuswb mm7, mm2 /* mm7 = 57 56 55 54 53 52 51 50 */
- movq [eax+ecx], mm7 /* write 50 51 52 53 54 55 56 57 */
- packuswb mm0, mm3 /* mm0 = 67 66 65 64 63 62 61 60 */
- packuswb mm4, mm6 /* mm4 = 77 76 75 74 73 72 71 70 */
-
- movq [eax+ecx*2], mm0 /* write 60 61 62 63 64 65 66 67 */
- lea eax, [eax+ecx*4] /* mov forward the desPtr */
- movq [eax+edx], mm4 /* write 70 71 72 73 74 75 76 77 */
-
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebp
- pop eax
- }//__asm
- Var1 = Variance11[0]+ Variance11[1]+Variance11[2]+Variance11[3];
- Var1 += Variance11[4]+ Variance11[5]+Variance11[6]+Variance11[7];
- pbi->FragmentVariances[CurrentFrag-1] += Var1;
- Var2 = Variance21[0]+ Variance21[1]+Variance21[2]+Variance21[3];
- Var2 += Variance21[4]+ Variance21[5]+Variance21[6]+Variance21[7];
- pbi->FragmentVariances[CurrentFrag] += Var2;
- CurrentFrag ++;
- }//else
-
- }//while
- }
- /****************************************************************************
- *
- * ROUTINE : PlaneAddNoise_mmx
- *
- * INPUTS : UINT8 *Start starting address of buffer to add gaussian
- * noise to
- * UINT32 Width width of plane
- * UINT32 Height height of plane
- * INT32 Pitch distance between subsequent lines of frame
- * INT32 q quantizer used to determine amount of noise
- * to add
- *
- * OUTPUTS : None.
- *
- * RETURNS : void.
- *
- * FUNCTION : adds gaussian noise to a plane of pixels
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
- void PlaneAddNoise_mmx( UINT8 *Start, UINT32 Width, UINT32 Height, INT32 Pitch, int q)
- {
- unsigned int i;
- INT32 Pitch4 = Pitch * 4;
- const int noiseAmount = 2;
- const int noiseAdder = 2 * noiseAmount + 1;
- #if defined(_WIN32_WCE)
- #pragma pack(16)
- unsigned char blackclamp[16];
- unsigned char whiteclamp[16];
- unsigned char bothclamp[16];
- #pragma pack()
- #else
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
- #endif
- char CharDist[300];
- char Rand[2048] =
- {
- -2,0,-2,2,0,0,-1,2,2,1,-2,2,1,0,-1,-2,-2,-1,-2,-2,2,0,-2,-2,-2,-1,0,0,1,1,-2,1,0,-1,-2,1,1,2,0,-1,2,1,2,2,0,-2,0,-1,2,-1,1,2,2,2,1,-1,-1,-1,2,-2,-1,-2,1,-2,-2,2,-1,-1,0,1,2,1,0,-1,1,0,0,2,1,-2,0,-1,1,1,0,-1,-2,-1,0,2,0,2,1,-1,-2,1,0,-2,1,0,-2,2,-2,2,1,-1,0,-2,2,1,-2,2,2,0,-2,-2,2,0,-2,0,1,0,-1,0,1,1,1,0,-2,-1,2,-2,0,1,0,-2,2,2,0,-1,0,-1,2,-1,0,-1,2,-1,1,0,-2,1,2,-1,0,2,-2,2,0,-2,0,-2,2,1,1,-2,2,-2,-2,1,-1,2,-1,-1,-2,1,2,1,1,1,-1,-2,-2,-2,2,2,-1,-2,0,-2,-2,0,1,1,0,-2,0,-1,1,-1,0,-1,0,0,1,-2,0,2,1,2,-2,-1,-2,2,0,2,-2,1,-2,0,2,-2,2,-1,-1,1,0,-1,1,1,0,0,0,1,2,2,1,1,0,-1,-2,1,0,2,-1,-2,1,1,0,-1,0,-2,1,1,1,1,2,-2,0,2,2,1,1,-2,1,2,-1,0,-1,-2,-2,2,2,1,-2,-1,-2,-2,1,2,0,0,0,-1,0,0,-2,-1,1,-1,2,2,2,1,-1,2,-2,-2,1,0,1,2,-2,2,1,-1,-2,0,-1,-1,2,0,1,-2,0,-1,0,1,0,-1,1,0,1,-1,-2,1,-2,1,2,0,1,0,-1,1,0,-1,2,1,-2,-1,-2,1,2,1,-2,-1,-2,1,-2,2,2,0,1,2,-2,-2,1,1,-1,-2,-2,1,-1,-1,-1,1,2,2,0,1,1,2,-2,1,0,-1,-2,2,-2,0,0,-1,0,-1,-1,-2,2,-2,-1,1,2,1,1,1,-1,2,-1,2,-1,-1,0,2,-2,-2,0,0,-2,-1,2,-1,-2,-2,2,-2,-2,-2,-1,2,-1,0,2,2,0,2,1,-1,-1,-2,0,2,-1,-1,0,-1,1,2,0,2,-2,2,1,1,0,-2,-1,-1,-2,0,-2,1,2,-2,2,1,1,2,0,1,-2,1,1,1,-2,2,1,1,-2,0,2,-2,-1,-2,2,1,-1,2,-1,1,-1,-2,-1,0,2,-2,2,0,-2,1,-2,2,1,2,-1,0,-2,1,-2,0,-1,2,-2,-1,-2,-1,-2,1,2,2,-2,1,1,1,2,0,2,1,-2,1,0,0,2,0,0,0,-1,-1,-1,-2,1,-2,-2,-1,0,-2,
- -2,-2,1,0,1,1,0,1,-1,2,0,-2,2,2,-1,2,-2,2,0,0,1,1,-2,-1,-1,0,2,1,1,2,-1,-1,2,-1,-1,0,-1,1,1,1,1,-2,-1,-1,1,2,-1,0,-2,2,-1,0,1,0,1,-2,-2,-2,-2,-1,-1,1,-2,-1,-2,1,1,-2,1,1,1,0,-2,0,-2,2,0,2,1,0,1,1,-1,-1,-2,2,-2,-2,-1,1,-1,-1,0,-2,0,0,1,1,0,-1,2,2,1,2,-2,0,2,-1,-1,-1,-2,1,-1,-2,-2,0,2,2,0,1,1,2,2,0,0,-2,1,0,0,0,0,2,1,-1,-2,-1,-1,-1,1,-1,2,-2,1,1,2,-2,0,2,1,2,-2,2,1,2,2,2,1,-2,1,-1,-1,1,1,-2,1,0,-2,2,2,-2,-1,0,0,1,-2,1,2,-2,1,1,-2,-2,-1,1,2,0,-1,1,-1,1,-1,-1,2,-1,-2,1,-2,-2,-2,-1,1,-1,0,0,-2,0,1,-1,1,2,0,0,-2,0,-1,0,2,0,-2,0,1,1,2,2,-1,2,1,1,2,1,2,2,2,0,0,-2,-1,2,0,-2,-2,1,1,-2,-2,-1,1,2,-2,-2,-2,-1,-2,2,1,-2,2,1,0,-2,-1,-1,1,1,-2,2,-2,1,0,2,0,-1,-1,1,-1,0,1,-2,2,1,-2,0,1,2,1,1,1,2,1,-1,0,-1,0,1,-1,0,0,2,1,1,1,0,1,1,2,-1,1,2,0,2,0,0,0,2,2,-2,-1,-1,1,2,1,-2,1,-2,0,0,0,-2,2,-2,1,-2,-2,1,-1,-1,1,0,0,-1,1,-2,0,0,2,0,-2,-1,-1,-2,2,1,2,1,1,0,1,1,2,0,-1,-2,2,2,0,-2,2,1,-2,0,2,-2,-2,-1,-2,0,-2,1,0,1,1,2,1,-1,2,-1,2,1,-1,-2,-1,-2,0,-2,2,-2,-1,-1,-2,-2,-2,1,1,2,-2,0,0,2,0,0,1,-1,0,-2,2,2,2,-2,0,1,1,1,-1,2,1,-2,0,-2,0,1,1,-2,1,0,2,2,1,-1,-1,0,-2,1,-2,1,1,-1,-2,-2,1,-2,-1,1,1,0,2,1,-1,0,2,-2,-2,-2,-2,2,-1,-1,2,-2,2,-1,2,-1,-1,-1,-1,2,2,2,2,1,-2,-2,-2,-1,0,-2,2,1,0,2,0,1,2,2,2,2,-2,-1,-1,-2,2,1,1,-2,1,2,1,2,-2,1,-1,1,2,2,-2,1,0,-2,-1,0,-2,2,0,-1,1,2,-1,-2,1,-1,0,2,2,-1,0,2,2,1,
- -1,2,-1,-1,-2,0,-1,-2,-1,2,-1,2,-2,2,2,0,-1,1,0,1,0,-2,2,-2,-1,-1,1,0,2,1,1,0,2,1,-2,0,-2,-2,1,-1,2,0,1,-2,1,-2,1,2,0,1,-1,2,1,0,-1,2,0,1,-1,-2,0,1,0,-1,-2,-1,0,2,0,2,-1,0,-2,2,2,0,1,-1,1,0,0,-2,-1,-1,2,2,2,1,0,-2,0,-1,0,-2,2,-1,1,2,0,-1,-1,0,2,-1,-1,1,2,-1,-2,0,2,0,-2,2,-2,1,-1,-2,-2,-1,0,2,-2,-2,-1,-1,0,0,0,2,1,-1,0,0,2,0,2,1,2,0,2,-1,2,-1,2,1,-2,1,0,-2,-2,-2,0,2,-2,-2,-1,2,1,1,1,-1,1,2,2,-1,0,-2,-2,-2,-1,1,0,-2,-1,-2,1,-2,-2,0,-1,2,-2,2,-2,-2,-2,2,-1,0,-1,0,1,2,2,2,-2,-2,0,2,2,-2,2,2,-1,0,1,0,-1,2,2,1,0,-1,-2,-2,1,0,-1,-1,0,1,2,1,2,-1,0,-1,2,0,-1,0,0,-1,-1,-2,-1,-1,2,1,2,1,1,-1,1,-2,1,2,-1,-2,0,-2,2,1,0,1,0,1,1,1,1,2,-2,0,1,-2,0,-2,0,-1,-2,-1,2,0,1,-2,-1,2,2,-1,-1,-1,-2,2,-2,-2,-1,-1,1,1,-2,-1,-2,-1,0,-2,1,-2,0,1,-1,-2,-1,1,2,0,2,-2,1,2,1,1,0,0,-2,2,-1,-2,-1,-1,0,1,-1,2,-1,1,-1,-2,1,-1,-1,1,2,-1,2,-1,2,1,-1,-1,-1,0,-1,-1,-2,-2,1,2,1,2,-2,0,1,2,-1,1,1,2,2,2,1,-1,1,-2,0,1,-1,2,-2,0,-2,1,-1,-2,-1,-2,2,1,-2,0,-2,2,-2,0,2,0,2,0,0,0,1,2,2,-1,-2,1,-2,1,0,2,1,-1,0,-1,1,2,-2,-2,-1,-1,-1,2,2,-1,-2,0,0,2,0,-1,0,-1,0,2,-1,-1,2,0,0,1,1,-2,-2,-1,-2,-1,0,1,-1,-2,1,-2,-1,2,0,2,-1,-2,0,-1,-2,0,1,-2,2,-1,2,0,-1,-1,0,-1,0,1,2,-1,0,1,1,-2,-2,1,2,1,-1,0,-2,0,-2,-1,2,-1,-1,-2,-1,-2,-1,-1,-2,-1,-2,0,2,2,0,2,-2,0,0,1,-1,2,-1,-1,2,2,1,1,-2,-1,-1,2,2,0,1,-1,2,0,-2,2,-2,-1,-1,1,0,0,-2,
- 2,-2,-2,2,0,1,-2,-2,0,1,0,2,2,-1,0,2,-2,2,0,-1,-2,-1,-2,-2,-2,2,0,1,-1,1,1,2,2,2,-1,-2,-2,2,-2,2,-1,2,-1,-1,1,2,-1,0,1,-1,0,0,2,1,1,0,2,0,-1,-1,-2,2,1,-1,-1,-1,-1,-2,2,-1,0,-2,2,1,1,-2,0,1,0,1,2,-2,-1,2,1,-2,2,-2,1,-2,-2,-2,0,0,0,-1,-2,-1,-2,0,-2,-1
- };
- double sigma;
- __asm emms
- sigma = 1 + .8*(63-q) / 63.0;
- // set up a lookup table of 256 entries that matches
- // a gaussian distribution with sigma determined by q.
- //
- {
- double i,sum=0;
- int next,j;
- next=0;
- for(i=-32;i<32;i++)
- {
- int a = (int)(.5+256*gaussian(sigma,0,i));
- if(a)
- {
- for(j=0;j<a;j++)
- {
- CharDist[next+j]=(char) i;
- }
- next = next+j;
- }
- }
- for(next=next;next<256;next++)
- CharDist[next] = 0;
- }
- for(i=0;i<2048;i++)
- {
- Rand[i]=CharDist[rand() & 0xff];
- }
- for(i=0;i<16;i++)
- {
- blackclamp[i]=-CharDist[0];
- whiteclamp[i]=-CharDist[0];
- bothclamp[i]=-2*CharDist[0];
- }
- for(i=0;i<Height;i++)
- {
- UINT8 *Pos = Start + i *Pitch;
- INT8 *Ref = Rand + (rand() & 0xff);
- __asm
- {
- mov ecx, [Width]
- mov esi,Pos
- mov edi,Ref
- xor eax,eax
- nextset:
- movq mm1,[esi+eax] // get the source
- psubusb mm1,blackclamp // clamp both sides so we don't outrange adding noise
- paddusb mm1,bothclamp
- psubusb mm1,whiteclamp
- movq mm2,[edi+eax] // get the noise for this line
- paddb mm1,mm2 // add it in
- movq [esi+eax],mm1 // store the result
- add eax,8 // move to the next line
- cmp eax, ecx
- jl nextset
- }
- }
- }
|