/* mmxidct.c (recovered from a scraped source listing, ~61 KB).
   NOTE(review): the original page's line-number gutter was an extraction
   artifact and has been removed here; each source line below still carries
   its embedded "NNN." listing number as plain text. */
  1. /****************************************************************************
  2. *
  3. * Module Title : IDCTPart.c
  4. *
  5. * Description : IDCT with multiple versions based on # of non 0 coeffs
  6. *
  7. * AUTHOR : Scott Lavarnway, Tim Murphy
  8. *
  9. *****************************************************************************
  10. * Revision History
  11. *
  12. * 1.02 JBB 15 Nov 00 Cleaned out unused ifdefs
  13. * 1.01 YWX 15/05/00 Added MMX_idct3 for use in PostProcessor
  14. * 1.00 YWX 14/05/00 Configuration baseline from Scott Lavarnway
  15. *
  16. *****************************************************************************
  17. */
  18. // Dequantization + inverse discrete cosine transform.
  19. // Timothy S. Murphy 14 July 1999.
  20. #pragma warning(disable:4005)
  21. #include "codec_common.h"
  22. #include <math.h>
  23. #include <memory.h>
  24. #undef PI
  25. #define PI 3.14159265358979323846
  26. // Constants used in MMX implementation of dequantization and idct.
  27. // All the MMX stuff works with 4 16-bit quantities at a time and
  28. // we create 11 constants of size 4 x 16 bits.
  29. // The first 4 are used to mask the individual 16-bit words within a group
  30. // and are used in the address-shuffling part of the dequantization.
  31. // The last 7 are fixed-point approximations to the cosines of angles
  32. // occurring in the DCT; each of these contains 4 copies of the same value.
  33. // There is only one (statically initialized) instance of this object
  34. // wrapped in an allocator object that forces its starting address
  35. // to be evenly divisible by 32. Hence the actual object occupies 2.75
  36. // cache lines on a Pentium processor.
  37. // Offsets in bytes used by the assembler code below
  38. // must of course agree with the idctConstants constructor.
  39. #define MaskOffset 0 // 4 masks come in order low word to high
  40. #define CosineOffset 32 // 7 cosines come in order pi/16 * (1 ... 7)
  41. #define EightOffset 88
  42. #define IdctAdjustBeforeShift 8
  43. #pragma warning( disable : 4799 ) // Disable no emms instruction warning!
  44. UINT16 idctconstants[(4+7+1) * 4];
  45. UINT16 idctcosTbl[ 7] =
  46. {
  47. 64277, 60547, 54491, 46341, 36410, 25080, 12785
  48. };
  49. /* Dequantization + inverse DCT.
  50. Dequantization multiplies user's 16-bit signed indices (range -512 to +511)
  51. by unsigned 16-bit quantization table entries.
  52. These table entries are upscaled by 4, max is 30 * 128 * 4 < 2^14.
  53. Result is scaled signed DCT coefficients (abs value < 2^15).
  54. In the data stream, the coefficients are sent in order of increasing
  55. total (horizontal + vertical) frequency. The exact picture is as follows:
  56. 00 01 05 06 16 17 33 34
  57. 02 04 07 15 20 32 35 52
  58. 03 10 14 21 31 36 51 53
  59. 11 13 22 30 37 50 54 65
  60. 12 23 27 40 47 55 64 66
  61. 24 26 41 46 56 63 67 74
  62. 25 42 45 57 62 70 73 75
  63. 43 44 60 61 71 72 76 77
  64. Here the position in the matrix corresponds to the (horiz,vert)
  65. frequency indices and the octal entry in the matrix is the position
  66. of the coefficient in the data stream. Thus the coefficients are sent
  67. in sort of a diagonal "snake".
  68. The dequantization stage "uncurls the snake" and stores the expanded
  69. coefficients in more convenient positions. These are not exactly the
  70. natural positions given above but take into account our implementation
  71. of the idct, which basically requires two one-dimensional idcts and
  72. two transposes.
  73. We fold the first transpose into the storage of the expanded coefficients.
  74. We don't actually do a full transpose because this would require doubling
  75. the size of the idct buffer; rather, we just transpose each of the 4x4
  76. subblocks. Using slightly varying addressing schemes in each of the
  77. four 4x8 idcts then allows these transforms to be done in place.
  78. Transposing the 4x4 subblocks in the matrix above gives
  79. 00 02 03 11 16 20 31 37
  80. 01 04 10 13 17 32 36 50
  81. 05 07 14 22 33 35 51 54
  82. 06 15 21 30 34 52 53 65
  83. 12 24 25 43 47 56 62 71
  84. 23 26 42 44 55 63 70 72
  85. 27 41 45 60 64 67 73 76
  86. 40 46 57 61 66 74 75 77
  87. Finally, we reverse the words in each 4 word group to clarify
  88. direction of shifts.
  89. 11 03 02 00 37 31 20 16
  90. 13 10 04 01 50 36 32 17
  91. 22 14 07 05 54 51 35 33
  92. 30 21 15 06 65 53 52 34
  93. 43 25 24 12 71 62 56 47
  94. 44 42 26 23 72 70 63 55
  95. 60 45 41 27 76 73 67 64
  96. 61 57 46 40 77 75 74 66
  97. This matrix then shows the 16 4x16 destination words in terms of
  98. the 16 4x16 input words.
  99. We implement this algorithm by manipulation of mmx registers,
  100. which seems to be the fastest way to proceed. It is completely
  101. hand-written; there does not seem to be enough recurrence to
  102. reasonably compartmentalize any of it. Hence the resulting
  103. program is ugly and bloated. Furthermore, due to the absence of
  104. register pressure, it is boring and artless. I hate it.
  105. The idct itself is more interesting. Since the two-dimensional dct
  106. basis functions are products of the one-dimensional dct basis functions,
  107. we can compute an inverse (or forward) dct via two 1-D transforms,
  108. on rows then on columns. To exploit MMX parallelism, we actually do
  109. both operations on columns, interposing a (partial) transpose between
  110. the two 1-D transforms, the first transpose being done by the expansion
  111. described above.
  112. The 8-sample one-dimensional DCT is a standard orthogonal expansion using
  113. the (unnormalized) basis functions
  114. b[k]( i) = cos( pi * k * (2i + 1) / 16);
  115. here k = 0 ... 7 is the frequency and i = 0 ... 7 is the spatial coordinate.
  116. To normalize, b[0] should be multiplied by 1/sqrt( 8) and the other b[k]
  117. should be multiplied by 1/2.
  118. The 8x8 two-dimensional DCT is just the product of one-dimensional DCTs
  119. in each direction. The (unnormalized) basis functions are
  120. B[k,l]( i, j) = b[k]( i) * b[l]( j);
  121. this time k and l are the horizontal and vertical frequencies,
  122. i and j are the horizontal and vertical spatial coordinates;
  123. all indices vary from 0 ... 7 (as above)
  124. and there are now 4 cases of normalization.
  125. Our 1-D idct expansion uses constants C1 ... C7 given by
  126. (*) Ck = C(-k) = cos( pi * k/16) = S(8-k) = -S(k-8) = sin( pi * (8-k)/16)
  127. and the following 1-D algorithm transforming I0 ... I7 to R0 ... R7 :
  128. A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
  129. C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
  130. A. = C4 * (A - C) B. = C4 * (B - D)
  131. C. = A + C D. = B + D
  132. E = C4 * (I0 + I4) F = C4 * (I0 - I4)
  133. G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
  134. E. = E - G
  135. G. = E + G
  136. A.. = F + A. B.. = B. - H
  137. F. = F - A. H. = B. + H
  138. R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
  139. R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
  140. This algorithm was also used by Paul Wilkins in his C implementation;
  141. it is due to Vetterli and Ligtenberg and may be found in the JPEG
  142. reference book by Pennebaker and Mitchell.
  143. Correctness of the algorithm follows from (*) together with the
  144. addition formulas for sine and cosine:
  145. cos( A + B) = cos( A) * cos( B) - sin( A) * sin( B)
  146. sin( A + B) = sin( A) * cos( B) + cos( A) * sin( B)
  147. Note that this implementation absorbs the difference in normalization
  148. between the 0th and higher frequencies, although the results produced
  149. are actually twice as big as they should be. Since we do this for each
  150. dimension, the 2-D idct results are 4x the desired results. Finally,
  151. taking into account that the dequantization multiplies by 4 as well,
  152. our actual results are 16x too big. We fix this by shifting the final
  153. results right by 4 bits.
  154. High precision version approximates C1 ... C7 to 16 bits.
  155. Since MMX only provides a signed multiply, C1 ... C5 appear to be
  156. negative and multiplies involving them must be adjusted to compensate
  157. for this. C6 and C7 do not require this adjustment since
  158. they are < 1/2 and are correctly treated as positive numbers.
  159. Following macro does four 8-sample one-dimensional idcts in parallel.
  160. This is actually not such a difficult program to write once you
  161. make a couple of observations (I of course was unable to make these
  162. observations until I'd half-written a couple of other versions).
  163. 1. Everything is easy once you are done with the multiplies.
  164. This is because, given X and Y in registers, one may easily
  165. calculate X+Y and X-Y using just those 2 registers.
  166. 2. You always need at least 2 extra registers to calculate products,
  167. so storing 2 temporaries is inevitable. C. and D. seem to be
  168. the best candidates.
  169. 3. The products should be calculated in decreasing order of complexity
  170. (which translates into register pressure). Since C1 ... C5 require
  171. adjustment (and C6, C7 do not), we begin by calculating C and D.
  172. */
  173. /**************************************************************************************
  174. *
  175. * Routine: BeginIDCT
  176. *
  177. * Description: The Macro does IDct on 4 1-D Dcts
  178. *
  179. * Input: None
  180. *
  181. * Output: None
  182. *
  183. * Return: None
  184. *
  185. * Special Note: None
  186. *
  187. * Error: None
  188. *
  189. ***************************************************************************************
  190. */
  191. #define Dump __asm call MMX_dump
  192. #define BeginIDCT __asm { \
  193. \
  194. __asm movq r2, I(3) \
  195. \
  196. __asm movq r6, C(3) \
  197. __asm movq r4, r2 \
  198. __asm movq r7, J(5) \
  199. __asm pmulhw r4, r6 /* r4 = c3*i3 - i3 */ \
  200. __asm movq r1, C(5) \
  201. __asm pmulhw r6, r7 /* r6 = c3*i5 - i5 */ \
  202. __asm movq r5, r1 \
  203. __asm pmulhw r1, r2 /* r1 = c5*i3 - i3 */ \
  204. __asm movq r3, I(1) \
  205. __asm pmulhw r5, r7 /* r5 = c5*i5 - i5 */ \
  206. __asm movq r0, C(1) /* (all registers are in use) */ \
  207. __asm paddw r4, r2 /* r4 = c3*i3 */ \
  208. __asm paddw r6, r7 /* r6 = c3*i5 */ \
  209. __asm paddw r2, r1 /* r2 = c5*i3 */ \
  210. __asm movq r1, J(7) \
  211. __asm paddw r7, r5 /* r7 = c5*i5 */ \
  212. __asm movq r5, r0 /* r5 = c1 */ \
  213. __asm pmulhw r0, r3 /* r0 = c1*i1 - i1 */ \
  214. __asm paddsw r4, r7 /* r4 = C = c3*i3 + c5*i5 */ \
  215. __asm pmulhw r5, r1 /* r5 = c1*i7 - i7 */ \
  216. __asm movq r7, C(7) \
  217. __asm psubsw r6, r2 /* r6 = D = c3*i5 - c5*i3 (done w/r2) */ \
  218. __asm paddw r0, r3 /* r0 = c1*i1 */ \
  219. __asm pmulhw r3, r7 /* r3 = c7*i1 */ \
  220. __asm movq r2, I(2) \
  221. __asm pmulhw r7, r1 /* r7 = c7*i7 */ \
  222. __asm paddw r5, r1 /* r5 = c1*i7 */ \
  223. __asm movq r1, r2 /* r1 = i2 */ \
  224. __asm pmulhw r2, C(2) /* r2 = c2*i2 - i2 */ \
  225. __asm psubsw r3, r5 /* r3 = B = c7*i1 - c1*i7 */ \
  226. __asm movq r5, J(6) \
  227. __asm paddsw r0, r7 /* r0 = A = c1*i1 + c7*i7 */ \
  228. __asm movq r7, r5 /* r7 = i6 */ \
  229. __asm psubsw r0, r4 /* r0 = A - C */ \
  230. __asm pmulhw r5, C(2) /* r5 = c2*i6 - i6 */ \
  231. __asm paddw r2, r1 /* r2 = c2*i2 */ \
  232. __asm pmulhw r1, C(6) /* r1 = c6*i2 */ \
  233. __asm paddsw r4, r4 /* r4 = C + C */ \
  234. __asm paddsw r4, r0 /* r4 = C. = A + C */ \
  235. __asm psubsw r3, r6 /* r3 = B - D */ \
  236. __asm paddw r5, r7 /* r5 = c2*i6 */ \
  237. __asm paddsw r6, r6 /* r6 = D + D */ \
  238. __asm pmulhw r7, C(6) /* r7 = c6*i6 */ \
  239. __asm paddsw r6, r3 /* r6 = D. = B + D */ \
  240. __asm movq I(1), r4 /* save C. at I(1) */ \
  241. __asm psubsw r1, r5 /* r1 = H = c6*i2 - c2*i6 */ \
  242. __asm movq r4, C(4) \
  243. __asm movq r5, r3 /* r5 = B - D */ \
  244. __asm pmulhw r3, r4 /* r3 = (c4 - 1) * (B - D) */ \
  245. __asm paddsw r7, r2 /* r7 = G = c6*i6 + c2*i2 */ \
  246. __asm movq I(2), r6 /* save D. at I(2) */ \
  247. __asm movq r2, r0 /* r2 = A - C */ \
  248. __asm movq r6, I(0) \
  249. __asm pmulhw r0, r4 /* r0 = (c4 - 1) * (A - C) */ \
  250. __asm paddw r5, r3 /* r5 = B. = c4 * (B - D) */ \
  251. \
  252. __asm movq r3, J(4) \
  253. __asm psubsw r5, r1 /* r5 = B.. = B. - H */ \
  254. __asm paddw r2, r0 /* r0 = A. = c4 * (A - C) */ \
  255. __asm psubsw r6, r3 /* r6 = i0 - i4 */ \
  256. __asm movq r0, r6 \
  257. __asm pmulhw r6, r4 /* r6 = (c4 - 1) * (i0 - i4) */ \
  258. __asm paddsw r3, r3 /* r3 = i4 + i4 */ \
  259. __asm paddsw r1, r1 /* r1 = H + H */ \
  260. __asm paddsw r3, r0 /* r3 = i0 + i4 */ \
  261. __asm paddsw r1, r5 /* r1 = H. = B + H */ \
  262. __asm pmulhw r4, r3 /* r4 = (c4 - 1) * (i0 + i4) */ \
  263. __asm paddsw r6, r0 /* r6 = F = c4 * (i0 - i4) */ \
  264. __asm psubsw r6, r2 /* r6 = F. = F - A. */ \
  265. __asm paddsw r2, r2 /* r2 = A. + A. */ \
  266. __asm movq r0, I(1) /* r0 = C. */ \
  267. __asm paddsw r2, r6 /* r2 = A.. = F + A. */ \
  268. __asm paddw r4, r3 /* r4 = E = c4 * (i0 + i4) */ \
  269. __asm psubsw r2, r1 /* r2 = R2 = A.. - H. */ \
  270. }
  271. // end BeginIDCT macro (38 cycles).
  272. // Two versions of the end of the idct depending on whether we're feeding
  273. // into a transpose or dividing the final results by 16 and storing them.
  274. /**************************************************************************************
  275. *
  276. * Routine: RowIDCT
  277. *
  278. * Description: The Macro does 1-D IDct on 4 Rows
  279. *
  280. * Input: None
  281. *
  282. * Output: None
  283. *
  284. * Return: None
  285. *
  286. * Special Note: None
  287. *
  288. * Error: None
  289. *
  290. ***************************************************************************************
  291. */
  292. // RowIDCT gets ready to transpose.
  293. #define RowIDCT __asm { \
  294. \
  295. BeginIDCT \
  296. \
  297. __asm movq r3, I(2) /* r3 = D. */ \
  298. __asm psubsw r4, r7 /* r4 = E. = E - G */ \
  299. __asm paddsw r1, r1 /* r1 = H. + H. */ \
  300. __asm paddsw r7, r7 /* r7 = G + G */ \
  301. __asm paddsw r1, r2 /* r1 = R1 = A.. + H. */ \
  302. __asm paddsw r7, r4 /* r7 = G. = E + G */ \
  303. __asm psubsw r4, r3 /* r4 = R4 = E. - D. */ \
  304. __asm paddsw r3, r3 \
  305. __asm psubsw r6, r5 /* r6 = R6 = F. - B.. */ \
  306. __asm paddsw r5, r5 \
  307. __asm paddsw r3, r4 /* r3 = R3 = E. + D. */ \
  308. __asm paddsw r5, r6 /* r5 = R5 = F. + B.. */ \
  309. __asm psubsw r7, r0 /* r7 = R7 = G. - C. */ \
  310. __asm paddsw r0, r0 \
  311. __asm movq I(1), r1 /* save R1 */ \
  312. __asm paddsw r0, r7 /* r0 = R0 = G. + C. */ \
  313. }
  314. // end RowIDCT macro (8 + 38 = 46 cycles)
  315. /**************************************************************************************
  316. *
  317. * Routine: ColumnIDCT
  318. *
  319. * Description: The Macro does 1-D IDct on 4 columns
  320. *
  321. * Input: None
  322. *
  323. * Output: None
  324. *
  325. * Return: None
  326. *
  327. * Special Note: None
  328. *
  329. * Error: None
  330. *
  331. ***************************************************************************************
  332. */
  333. // Column IDCT normalizes and stores final results.
  334. #define ColumnIDCT __asm { \
  335. \
  336. BeginIDCT \
  337. \
  338. __asm paddsw r2, Eight /* adjust R2 (and R1) for shift */ \
  339. __asm paddsw r1, r1 /* r1 = H. + H. */ \
  340. __asm paddsw r1, r2 /* r1 = R1 = A.. + H. */ \
  341. __asm psraw r2, 4 /* r2 = NR2 */ \
  342. __asm psubsw r4, r7 /* r4 = E. = E - G */ \
  343. __asm psraw r1, 4 /* r1 = NR1 */ \
  344. __asm movq r3, I(2) /* r3 = D. */ \
  345. __asm paddsw r7, r7 /* r7 = G + G */ \
  346. __asm movq I(2), r2 /* store NR2 at I2 */ \
  347. __asm paddsw r7, r4 /* r7 = G. = E + G */ \
  348. __asm movq I(1), r1 /* store NR1 at I1 */ \
  349. __asm psubsw r4, r3 /* r4 = R4 = E. - D. */ \
  350. __asm paddsw r4, Eight /* adjust R4 (and R3) for shift */ \
  351. __asm paddsw r3, r3 /* r3 = D. + D. */ \
  352. __asm paddsw r3, r4 /* r3 = R3 = E. + D. */ \
  353. __asm psraw r4, 4 /* r4 = NR4 */ \
  354. __asm psubsw r6, r5 /* r6 = R6 = F. - B.. */ \
  355. __asm psraw r3, 4 /* r3 = NR3 */ \
  356. __asm paddsw r6, Eight /* adjust R6 (and R5) for shift */ \
  357. __asm paddsw r5, r5 /* r5 = B.. + B.. */ \
  358. __asm paddsw r5, r6 /* r5 = R5 = F. + B.. */ \
  359. __asm psraw r6, 4 /* r6 = NR6 */ \
  360. __asm movq J(4), r4 /* store NR4 at J4 */ \
  361. __asm psraw r5, 4 /* r5 = NR5 */ \
  362. __asm movq I(3), r3 /* store NR3 at I3 */ \
  363. __asm psubsw r7, r0 /* r7 = R7 = G. - C. */ \
  364. __asm paddsw r7, Eight /* adjust R7 (and R0) for shift */ \
  365. __asm paddsw r0, r0 /* r0 = C. + C. */ \
  366. __asm paddsw r0, r7 /* r0 = R0 = G. + C. */ \
  367. __asm psraw r7, 4 /* r7 = NR7 */ \
  368. __asm movq J(6), r6 /* store NR6 at J6 */ \
  369. __asm psraw r0, 4 /* r0 = NR0 */ \
  370. __asm movq J(5), r5 /* store NR5 at J5 */ \
  371. \
  372. __asm movq J(7), r7 /* store NR7 at J7 */ \
  373. \
  374. __asm movq I(0), r0 /* store NR0 at I0 */ \
  375. \
  376. }
  377. // end ColumnIDCT macro (38 + 19 = 57 cycles)
  378. /**************************************************************************************
  379. *
  380. * Routine: Transpose
  381. *
  382. * Description: The Macro does two 4x4 transposes in place.
  383. *
  384. * Input: None
  385. *
  386. * Output: None
  387. *
  388. * Return: None
  389. *
  390. * Special Note: None
  391. *
  392. * Error: None
  393. *
  394. ***************************************************************************************
  395. */
  396. /* Following macro does two 4x4 transposes in place.
  397. At entry (we assume):
  398. r0 = a3 a2 a1 a0
  399. I(1) = b3 b2 b1 b0
  400. r2 = c3 c2 c1 c0
  401. r3 = d3 d2 d1 d0
  402. r4 = e3 e2 e1 e0
  403. r5 = f3 f2 f1 f0
  404. r6 = g3 g2 g1 g0
  405. r7 = h3 h2 h1 h0
  406. At exit, we have:
  407. I(0) = d0 c0 b0 a0
  408. I(1) = d1 c1 b1 a1
  409. I(2) = d2 c2 b2 a2
  410. I(3) = d3 c3 b3 a3
  411. J(4) = h0 g0 f0 e0
  412. J(5) = h1 g1 f1 e1
  413. J(6) = h2 g2 f2 e2
  414. J(7) = h3 g3 f3 e3
  415. I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
  416. J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
  417. Since r1 is free at entry, we calculate the Js first. */
  418. #define Transpose __asm { \
  419. \
  420. __asm movq r1, r4 /* r1 = e3 e2 e1 e0 */ \
  421. __asm punpcklwd r4, r5 /* r4 = f1 e1 f0 e0 */ \
  422. __asm movq I(0), r0 /* save a3 a2 a1 a0 */ \
  423. __asm punpckhwd r1, r5 /* r1 = f3 e3 f2 e2 */ \
  424. __asm movq r0, r6 /* r0 = g3 g2 g1 g0 */ \
  425. __asm punpcklwd r6, r7 /* r6 = h1 g1 h0 g0 */ \
  426. __asm movq r5, r4 /* r5 = f1 e1 f0 e0 */ \
  427. __asm punpckldq r4, r6 /* r4 = h0 g0 f0 e0 = R4 */ \
  428. __asm punpckhdq r5, r6 /* r5 = h1 g1 f1 e1 = R5 */ \
  429. __asm movq r6, r1 /* r6 = f3 e3 f2 e2 */ \
  430. __asm movq J(4), r4 \
  431. __asm punpckhwd r0, r7 /* r0 = h3 g3 h2 g2 */ \
  432. __asm movq J(5), r5 \
  433. __asm punpckhdq r6, r0 /* r6 = h3 g3 f3 e3 = R7 */ \
  434. __asm movq r4, I(0) /* r4 = a3 a2 a1 a0 */ \
  435. __asm punpckldq r1, r0 /* r1 = h2 g2 f2 e2 = R6 */ \
  436. __asm movq r5, I(1) /* r5 = b3 b2 b1 b0 */ \
  437. __asm movq r0, r4 /* r0 = a3 a2 a1 a0 */ \
  438. __asm movq J(7), r6 \
  439. __asm punpcklwd r0, r5 /* r0 = b1 a1 b0 a0 */ \
  440. __asm movq J(6), r1 \
  441. __asm punpckhwd r4, r5 /* r4 = b3 a3 b2 a2 */ \
  442. __asm movq r5, r2 /* r5 = c3 c2 c1 c0 */ \
  443. __asm punpcklwd r2, r3 /* r2 = d1 c1 d0 c0 */ \
  444. __asm movq r1, r0 /* r1 = b1 a1 b0 a0 */ \
  445. __asm punpckldq r0, r2 /* r0 = d0 c0 b0 a0 = R0 */ \
  446. __asm punpckhdq r1, r2 /* r1 = d1 c1 b1 a1 = R1 */ \
  447. __asm movq r2, r4 /* r2 = b3 a3 b2 a2 */ \
  448. __asm movq I(0), r0 \
  449. __asm punpckhwd r5, r3 /* r5 = d3 c3 d2 c2 */ \
  450. __asm movq I(1), r1 \
  451. __asm punpckhdq r4, r5 /* r4 = d3 c3 b3 a3 = R3 */ \
  452. __asm punpckldq r2, r5 /* r2 = d2 c2 b2 a2 = R2 */ \
  453. \
  454. __asm movq I(3), r4 \
  455. \
  456. __asm movq I(2), r2 \
  457. \
  458. }
  459. // end Transpose macro (19 cycles).
  460. /*
  461. __declspec( naked) static void MMX_dump()
  462. {
  463. __asm
  464. {
  465. movq [edi], mm0
  466. movq [edi+8], mm1
  467. movq [edi+16], mm2
  468. movq [edi+24], mm3
  469. movq [edi+32], mm4
  470. movq [edi+40], mm5
  471. movq [edi+48], mm6
  472. movq [edi+56], mm7
  473. ret
  474. }
  475. }
  476. */
  477. /**************************************************************************************
  478. *
  479. * Routine: MMX_idct
  480. *
  481. * Description: Perform IDCT on a 8x8 block
  482. *
  483. * Input: Pointer to input and output buffer
  484. *
  485. * Output: None
  486. *
  487. * Return: None
  488. *
  489. * Special Note: The input coefficients are in ZigZag order
  490. *
  491. * Error: None
  492. *
  493. ***************************************************************************************
  494. */
  495. __declspec ( naked ) void MMX_idct ( INT16 * input, INT16 * qtbl, INT16 * output)
  496. {
  497. // uINT16 *constants = idctconstants;
  498. # define M(I) [ecx + MaskOffset + I*8]
  499. # define C(I) [ecx + CosineOffset + (I-1)*8]
  500. # define Eight [ecx + EightOffset]
  501. # undef Arg
  502. # define Arg(I) [esp + 1*4 + 3*4 + I*4] // 1 return address + 3 pushes prior to args
  503. # define r0 mm0
  504. # define r1 mm1
  505. # define r2 mm2
  506. # define r3 mm3
  507. # define r4 mm4
  508. # define r5 mm5
  509. # define r6 mm6
  510. # define r7 mm7
  511. (void) output;
  512. (void) qtbl;
  513. (void) input;
  514. __asm {
  515. push edx
  516. push ecx
  517. push ebx
  518. ;; Label:
  519. mov eax, Arg( 0) ; eax = quantized input
  520. mov edx, Arg( 2) ; edx = destination (= idct buffer)
  521. mov ecx, [edx] ; (+1 at least) preload the cache before writing
  522. mov ebx, [edx+28] ; in case proc doesn't cache on writes
  523. mov ecx, [edx+56] ; gets all the cache lines
  524. mov ebx, [edx+84] ; regardless of alignment (beyond 32-bit)
  525. mov ecx, [edx+112] ; also avoids address contention stalls
  526. mov ebx, [edx+124]
  527. mov ebx, Arg( 1) ; ebx = quantization table
  528. lea ecx, idctconstants ;;[0];
  529. movq r0, [eax]
  530. ;
  531. pmullw r0, [ebx] ; r0 = 03 02 01 00
  532. ;
  533. movq r1, [eax+16]
  534. ;
  535. pmullw r1, [ebx+16] ; r1 = 13 12 11 10
  536. ;
  537. movq r2, M(0) ; r2 = __ __ __ FF
  538. movq r3, r0 ; r3 = 03 02 01 00
  539. movq r4, [eax+8]
  540. psrlq r0, 16 ; r0 = __ 03 02 01
  541. pmullw r4, [ebx+8] ; r4 = 07 06 05 04
  542. pand r3, r2 ; r3 = __ __ __ 00
  543. movq r5, r0 ; r5 = __ 03 02 01
  544. movq r6, r1 ; r6 = 13 12 11 10
  545. pand r5, r2 ; r5 = __ __ __ 01
  546. psllq r6, 32 ; r6 = 11 10 __ __
  547. movq r7, M(3) ; r7 = FF __ __ __
  548. pxor r0, r5 ; r0 = __ 03 02 __
  549. pand r7, r6 ; r7 = 11 __ __ __
  550. por r0, r3 ; r0 = __ 03 02 00
  551. pxor r6, r7 ; r6 = __ 10 __ __
  552. por r0, r7 ; r0 = 11 03 02 00 = R0
  553. movq r7, M(3) ; r7 = FF __ __ __
  554. movq r3, r4 ; r3 = 07 06 05 04
  555. movq [edx], r0 ; write R0 = r0
  556. pand r3, r2 ; r3 = __ __ __ 04
  557. movq r0, [eax+32]
  558. psllq r3, 16 ; r3 = __ __ 04 __
  559. pmullw r0, [ebx+32] ; r0 = 23 22 21 20
  560. pand r7, r1 ; r7 = 13 __ __ __
  561. por r5, r3 ; r5 = __ __ 04 01
  562. por r7, r6 ; r7 = 13 10 __ __
  563. movq r3, [eax+24]
  564. por r7, r5 ; r7 = 13 10 04 01 = R1
  565. pmullw r3, [ebx+24] ; r3 = 17 16 15 14
  566. psrlq r4, 16 ; r4 = __ 07 06 05
  567. movq [edx+16], r7 ; write R1 = r7
  568. movq r5, r4 ; r5 = __ 07 06 05
  569. movq r7, r0 ; r7 = 23 22 21 20
  570. psrlq r4, 16 ; r4 = __ __ 07 06
  571. psrlq r7, 48 ; r7 = __ __ __ 23
  572. movq r6, r2 ; r6 = __ __ __ FF
  573. pand r5, r2 ; r5 = __ __ __ 05
  574. pand r6, r4 ; r6 = __ __ __ 06
  575. movq [edx+80], r7 ; partial R9 = __ __ __ 23
  576. pxor r4, r6 ; r4 = __ __ 07 __
  577. psrlq r1, 32 ; r1 = __ __ 13 12
  578. por r4, r5 ; r4 = __ __ 07 05
  579. movq r7, M(3) ; r7 = FF __ __ __
  580. pand r1, r2 ; r1 = __ __ __ 12
  581. movq r5, [eax+48]
  582. psllq r0, 16 ; r0 = 22 21 20 __
  583. pmullw r5, [ebx+48] ; r5 = 33 32 31 30
  584. pand r7, r0 ; r7 = 22 __ __ __
  585. movq [edx+64], r1 ; partial R8 = __ __ __ 12
  586. por r7, r4 ; r7 = 22 __ 07 05
  587. movq r4, r3 ; r4 = 17 16 15 14
  588. pand r3, r2 ; r3 = __ __ __ 14
  589. movq r1, M(2) ; r1 = __ FF __ __
  590. psllq r3, 32 ; r3 = __ 14 __ __
  591. por r7, r3 ; r7 = 22 14 07 05 = R2
  592. movq r3, r5 ; r3 = 33 32 31 30
  593. psllq r3, 48 ; r3 = 30 __ __ __
  594. pand r1, r0 ; r1 = __ 21 __ __
  595. movq [edx+32], r7 ; write R2 = r7
  596. por r6, r3 ; r6 = 30 __ __ 06
  597. movq r7, M(1) ; r7 = __ __ FF __
  598. por r6, r1 ; r6 = 30 21 __ 06
  599. movq r1, [eax+56]
  600. pand r7, r4 ; r7 = __ __ 15 __
  601. pmullw r1, [ebx+56] ; r1 = 37 36 35 34
  602. por r7, r6 ; r7 = 30 21 15 06 = R3
  603. pand r0, M(1) ; r0 = __ __ 20 __
  604. psrlq r4, 32 ; r4 = __ __ 17 16
  605. movq [edx+48], r7 ; write R3 = r7
  606. movq r6, r4 ; r6 = __ __ 17 16
  607. movq r7, M(3) ; r7 = FF __ __ __
  608. pand r4, r2 ; r4 = __ __ __ 16
  609. movq r3, M(1) ; r3 = __ __ FF __
  610. pand r7, r1 ; r7 = 37 __ __ __
  611. pand r3, r5 ; r3 = __ __ 31 __
  612. por r0, r4 ; r0 = __ __ 20 16
  613. psllq r3, 16 ; r3 = __ 31 __ __
  614. por r7, r0 ; r7 = 37 __ 20 16
  615. movq r4, M(2) ; r4 = __ FF __ __
  616. por r7, r3 ; r7 = 37 31 20 16 = R4
  617. movq r0, [eax+80]
  618. movq r3, r4 ; r3 = __ __ FF __
  619. pmullw r0, [ebx+80] ; r0 = 53 52 51 50
  620. pand r4, r5 ; r4 = __ 32 __ __
  621. movq [edx+8], r7 ; write R4 = r7
  622. por r6, r4 ; r6 = __ 32 17 16
  623. movq r4, r3 ; r4 = __ FF __ __
  624. psrlq r6, 16 ; r6 = __ __ 32 17
  625. movq r7, r0 ; r7 = 53 52 51 50
  626. pand r4, r1 ; r4 = __ 36 __ __
  627. psllq r7, 48 ; r7 = 50 __ __ __
  628. por r6, r4 ; r6 = __ 36 32 17
  629. movq r4, [eax+88]
  630. por r7, r6 ; r7 = 50 36 32 17 = R5
  631. pmullw r4, [ebx+88] ; r4 = 57 56 55 54
  632. psrlq r3, 16 ; r3 = __ __ FF __
  633. movq [edx+24], r7 ; write R5 = r7
  634. pand r3, r1 ; r3 = __ __ 35 __
  635. psrlq r5, 48 ; r5 = __ __ __ 33
  636. pand r1, r2 ; r1 = __ __ __ 34
  637. movq r6, [eax+104]
  638. por r5, r3 ; r5 = __ __ 35 33
  639. pmullw r6, [ebx+104] ; r6 = 67 66 65 64
  640. psrlq r0, 16 ; r0 = __ 53 52 51
  641. movq r7, r4 ; r7 = 57 56 55 54
  642. movq r3, r2 ; r3 = __ __ __ FF
  643. psllq r7, 48 ; r7 = 54 __ __ __
  644. pand r3, r0 ; r3 = __ __ __ 51
  645. pxor r0, r3 ; r0 = __ 53 52 __
  646. psllq r3, 32 ; r3 = __ 51 __ __
  647. por r7, r5 ; r7 = 54 __ 35 33
  648. movq r5, r6 ; r5 = 67 66 65 64
  649. pand r6, M(1) ; r6 = __ __ 65 __
  650. por r7, r3 ; r7 = 54 51 35 33 = R6
  651. psllq r6, 32 ; r6 = 65 __ __ __
  652. por r0, r1 ; r0 = __ 53 52 34
  653. movq [edx+40], r7 ; write R6 = r7
  654. por r0, r6 ; r0 = 65 53 52 34 = R7
  655. movq r7, [eax+120]
  656. movq r6, r5 ; r6 = 67 66 65 64
  657. pmullw r7, [ebx+120] ; r7 = 77 76 75 74
  658. psrlq r5, 32 ; r5 = __ __ 67 66
  659. pand r6, r2 ; r6 = __ __ __ 64
  660. movq r1, r5 ; r1 = __ __ 67 66
  661. movq [edx+56], r0 ; write R7 = r0
  662. pand r1, r2 ; r1 = __ __ __ 66
  663. movq r0, [eax+112]
  664. movq r3, r7 ; r3 = 77 76 75 74
  665. pmullw r0, [ebx+112] ; r0 = 73 72 71 70
  666. psllq r3, 16 ; r3 = 76 75 74 __
  667. pand r7, M(3) ; r7 = 77 __ __ __
  668. pxor r5, r1 ; r5 = __ __ 67 __
  669. por r6, r5 ; r6 = __ __ 67 64
  670. movq r5, r3 ; r5 = 76 75 74 __
  671. pand r5, M(3) ; r5 = 76 __ __ __
  672. por r7, r1 ; r7 = 77 __ __ 66
  673. movq r1, [eax+96]
  674. pxor r3, r5 ; r3 = __ 75 74 __
  675. pmullw r1, [ebx+96] ; r1 = 63 62 61 60
  676. por r7, r3 ; r7 = 77 75 74 66 = R15
  677. por r6, r5 ; r6 = 76 __ 67 64
  678. movq r5, r0 ; r5 = 73 72 71 70
  679. movq [edx+120], r7 ; store R15 = r7
  680. psrlq r5, 16 ; r5 = __ 73 72 71
  681. pand r5, M(2) ; r5 = __ 73 __ __
  682. movq r7, r0 ; r7 = 73 72 71 70
  683. por r6, r5 ; r6 = 76 73 67 64 = R14
  684. pand r0, r2 ; r0 = __ __ __ 70
  685. pxor r7, r0 ; r7 = 73 72 71 __
  686. psllq r0, 32 ; r0 = __ 70 __ __
  687. movq [edx+104], r6 ; write R14 = r6
  688. psrlq r4, 16 ; r4 = __ 57 56 55
  689. movq r5, [eax+72]
  690. psllq r7, 16 ; r7 = 72 71 __ __
  691. pmullw r5, [ebx+72] ; r5 = 47 46 45 44
  692. movq r6, r7 ; r6 = 72 71 __ __
  693. movq r3, M(2) ; r3 = __ FF __ __
  694. psllq r6, 16 ; r6 = 71 __ __ __
  695. pand r7, M(3) ; r7 = 72 __ __ __
  696. pand r3, r1 ; r3 = __ 62 __ __
  697. por r7, r0 ; r7 = 72 70 __ __
  698. movq r0, r1 ; r0 = 63 62 61 60
  699. pand r1, M(3) ; r1 = 63 __ __ __
  700. por r6, r3 ; r6 = 71 62 __ __
  701. movq r3, r4 ; r3 = __ 57 56 55
  702. psrlq r1, 32 ; r1 = __ __ 63 __
  703. pand r3, r2 ; r3 = __ __ __ 55
  704. por r7, r1 ; r7 = 72 70 63 __
  705. por r7, r3 ; r7 = 72 70 63 55 = R13
  706. movq r3, r4 ; r3 = __ 57 56 55
  707. pand r3, M(1) ; r3 = __ __ 56 __
  708. movq r1, r5 ; r1 = 47 46 45 44
  709. movq [edx+88], r7 ; write R13 = r7
  710. psrlq r5, 48 ; r5 = __ __ __ 47
  711. movq r7, [eax+64]
  712. por r6, r3 ; r6 = 71 62 56 __
  713. pmullw r7, [ebx+64] ; r7 = 43 42 41 40
  714. por r6, r5 ; r6 = 71 62 56 47 = R12
  715. pand r4, M(2) ; r4 = __ 57 __ __
  716. psllq r0, 32 ; r0 = 61 60 __ __
  717. movq [edx+72], r6 ; write R12 = r6
  718. movq r6, r0 ; r6 = 61 60 __ __
  719. pand r0, M(3) ; r0 = 61 __ __ __
  720. psllq r6, 16 ; r6 = 60 __ __ __
  721. movq r5, [eax+40]
  722. movq r3, r1 ; r3 = 47 46 45 44
  723. pmullw r5, [ebx+40] ; r5 = 27 26 25 24
  724. psrlq r1, 16 ; r1 = __ 47 46 45
  725. pand r1, M(1) ; r1 = __ __ 46 __
  726. por r0, r4 ; r0 = 61 57 __ __
  727. pand r2, r7 ; r2 = __ __ __ 40
  728. por r0, r1 ; r0 = 61 57 46 __
  729. por r0, r2 ; r0 = 61 57 46 40 = R11
  730. psllq r3, 16 ; r3 = 46 45 44 __
  731. movq r4, r3 ; r4 = 46 45 44 __
  732. movq r2, r5 ; r2 = 27 26 25 24
  733. movq [edx+112], r0 ; write R11 = r0
  734. psrlq r2, 48 ; r2 = __ __ __ 27
  735. pand r4, M(2) ; r4 = __ 45 __ __
  736. por r6, r2 ; r6 = 60 __ __ 27
  737. movq r2, M(1) ; r2 = __ __ FF __
  738. por r6, r4 ; r6 = 60 45 __ 27
  739. pand r2, r7 ; r2 = __ __ 41 __
  740. psllq r3, 32 ; r3 = 44 __ __ __
  741. por r3, [edx+80] ; r3 = 44 __ __ 23
  742. por r6, r2 ; r6 = 60 45 41 27 = R10
  743. movq r2, M(3) ; r2 = FF __ __ __
  744. psllq r5, 16 ; r5 = 26 25 24 __
  745. movq [edx+96], r6 ; store R10 = r6
  746. pand r2, r5 ; r2 = 26 __ __ __
  747. movq r6, M(2) ; r6 = __ FF __ __
  748. pxor r5, r2 ; r5 = __ 25 24 __
  749. pand r6, r7 ; r6 = __ 42 __ __
  750. psrlq r2, 32 ; r2 = __ __ 26 __
  751. pand r7, M(3) ; r7 = 43 __ __ __
  752. por r3, r2 ; r3 = 44 __ 26 23
  753. por r7, [edx+64] ; r7 = 43 __ __ 12
  754. por r6, r3 ; r6 = 44 42 26 23 = R9
  755. por r7, r5 ; r7 = 43 25 24 12 = R8
  756. ;
  757. movq [edx+80], r6 ; store R9 = r6
  758. ;
  759. movq [edx+64], r7 ; store R8 = r7
  760. ;
  761. ; 123c ( / 64 coeffs < 2c / coeff)
  762. # undef M
  763. ; Done w/dequant + descramble + partial transpose; now do the idct itself.
  764. # define I( K) [edx + ( K * 16)]
  765. # define J( K) [edx + ( (K - 4) * 16) + 8]
  766. RowIDCT ; 46 c
  767. Transpose ; 19 c
  768. # undef I
  769. # undef J
  770. # define I( K) [edx + ( K * 16) + 64]
  771. # define J( K) [edx + ( (K - 4) * 16) + 72]
  772. RowIDCT ; 46 c
  773. Transpose ; 19 c
  774. # undef I
  775. # undef J
  776. # define I( K) [edx + (K * 16)]
  777. # define J( K) I( K)
  778. ColumnIDCT ; 57 c
  779. # undef I
  780. # undef J
  781. # define I( K) [edx + (K * 16) + 8]
  782. # define J( K) I( K)
  783. ColumnIDCT ; 57 c
  784. # undef I
  785. # undef J
  786. pop ebx
  787. pop ecx
  788. pop edx
  789. ret
  790. ; 368 cycles ( / 64 coeff < 6 c / coeff)
  791. }
  792. }
  793. /**************************************************************************************
  794. *
  795. * Routine: MMX_idct10
  796. *
* Description: Perform IDCT on an 8x8 block with at most 10 nonzero coefficients
  798. *
  799. * Input: Pointer to input and output buffer
  800. *
  801. * Output: None
  802. *
  803. * Return: None
  804. *
  805. * Special Note: The input coefficients are in transposed ZigZag order
  806. *
  807. * Error: None
  808. *
  809. ***************************************************************************************
  810. */
  811. /* --------------------------------------------------------------- */
  812. // This macro does four 4-sample one-dimensional idcts in parallel. Inputs
  813. // 4 thru 7 are assumed to be zero.
#define BeginIDCT_10 __asm { \
/* Shared head of the reduced IDCT: four 4-sample 1-D idcts in parallel, \
   assuming inputs I(4)..I(7) are zero.  Leaves A.., B.., C., D., E, F., G, \
   H. in regs/scratch slots exactly as the full BeginIDCT does. */ \
\
__asm movq r2, I(3) \
__asm nop \
\
__asm movq r6, C(3) \
__asm movq r4, r2 \
\
__asm movq r1, C(5) \
__asm pmulhw r4, r6 /* r4 = c3*i3 - i3 (pmulhw keeps the high 16 bits) */ \
\
__asm movq r3, I(1) \
__asm pmulhw r1, r2 /* r1 = c5*i3 - i3 */ \
\
__asm movq r0, C(1) /* (all registers are in use) */ \
__asm paddw r4, r2 /* r4 = C = c3*i3 */ \
\
__asm pxor r6,r6 /* used to get -(c5*i3) */ \
__asm paddw r2, r1 /* r2 = c5*i3 */ \
\
__asm movq r5, I(2) \
__asm pmulhw r0, r3 /* r0 = c1*i1 - i1 */ \
\
__asm movq r1, r5 \
__asm paddw r0, r3 /* r0 = A = c1*i1 */ \
\
__asm pmulhw r3, C(7) /* r3 = B = c7*i1 */ \
__asm psubsw r6, r2 /* r6 = D = -c5*i3 */ \
\
__asm pmulhw r5, C(2) /* r5 = c2*i2 - i2 */ \
__asm psubsw r0, r4 /* r0 = A - C */ \
\
__asm movq r7,I(2) \
__asm paddsw r4, r4 /* r4 = C + C */ \
\
__asm paddw r7, r5 /* r7 = G = c2*i2 */ \
__asm paddsw r4, r0 /* r4 = C. = A + C */ \
\
__asm pmulhw r1, C(6) /* r1 = H = c6*i2 */ \
__asm psubsw r3, r6 /* r3 = B - D */ \
\
__asm movq I(1), r4 /* save C. at I(1) */ \
__asm paddsw r6, r6 /* r6 = D + D */ \
\
__asm movq r4, C(4) \
__asm paddsw r6, r3 /* r6 = D. = B + D */ \
\
__asm movq r5, r3 /* r5 = B - D */ \
__asm pmulhw r3, r4 /* r3 = (c4 - 1) * (B - D) */ \
\
__asm movq I(2), r6 /* save D. at I(2) */ \
__asm movq r2, r0 /* r2 = A - C */ \
\
__asm movq r6, I(0) \
__asm pmulhw r0, r4 /* r0 = (c4 - 1) * (A - C) */ \
\
__asm paddw r5, r3 /* r5 = B. = c4 * (B - D) */ \
__asm paddw r2, r0 /* r2 = A. = c4 * (A - C) */ \
\
__asm psubsw r5, r1 /* r5 = B.. = B. - H */ \
__asm pmulhw r6, r4 /* r6 = c4*i0 - i0 */ \
\
__asm paddw r6, I(0) /* r6 = E = c4*i0 */ \
__asm paddsw r1, r1 /* r1 = H + H */ \
\
__asm movq r4, r6 /* r4 = E */ \
__asm paddsw r1, r5 /* r1 = H. = B. + H */ \
\
__asm psubsw r6, r2 /* r6 = F. = E - A. */ \
__asm paddsw r2, r2 /* r2 = A. + A. */ \
\
__asm movq r0, I(1) /* r0 = C. */ \
__asm paddsw r2, r6 /* r2 = A.. = E + A. */ \
\
__asm psubsw r2, r1 /* r2 = R2 = A.. - H. */ \
__asm nop \
}
  891. // end BeginIDCT_10 macro (25 cycles).
#define RowIDCT_10 __asm { \
/* Row pass of the 10-coefficient IDCT: finish the butterfly started in \
   BeginIDCT_10 and leave unnormalized results R0..R7 (R1 saved at I(1)). */ \
\
BeginIDCT_10 \
\
__asm movq r3, I(2) /* r3 = D. */ \
__asm psubsw r4, r7 /* r4 = E. = E - G */ \
__asm paddsw r1, r1 /* r1 = H. + H. */ \
__asm paddsw r7, r7 /* r7 = G + G */ \
__asm paddsw r1, r2 /* r1 = R1 = A.. + H. */ \
__asm paddsw r7, r4 /* r7 = G. = E + G */ \
__asm psubsw r4, r3 /* r4 = R4 = E. - D. */ \
__asm paddsw r3, r3 /* r3 = D. + D. */ \
__asm psubsw r6, r5 /* r6 = R6 = F. - B.. */ \
__asm paddsw r5, r5 /* r5 = B.. + B.. */ \
__asm paddsw r3, r4 /* r3 = R3 = E. + D. */ \
__asm paddsw r5, r6 /* r5 = R5 = F. + B.. */ \
__asm psubsw r7, r0 /* r7 = R7 = G. - C. */ \
__asm paddsw r0, r0 /* r0 = C. + C. */ \
__asm movq I(1), r1 /* save R1 */ \
__asm paddsw r0, r7 /* r0 = R0 = G. + C. */ \
}
  913. // end RowIDCT macro (8 + 38 = 46 cycles)
  914. // Column IDCT normalizes and stores final results.
#define ColumnIDCT_10 __asm { \
/* Column pass of the 10-coefficient IDCT.  Each final result is \
   normalized as (x + 8) >> 4 (the "Eight" constant + psraw 4) and \
   stored through the I()/J() addressing macros. */ \
\
BeginIDCT_10 \
\
__asm paddsw r2, Eight /* adjust R2 (and R1) for shift */ \
__asm paddsw r1, r1 /* r1 = H. + H. */ \
__asm paddsw r1, r2 /* r1 = R1 = A.. + H. */ \
__asm psraw r2, 4 /* r2 = NR2 */ \
__asm psubsw r4, r7 /* r4 = E. = E - G */ \
__asm psraw r1, 4 /* r1 = NR1 */ \
__asm movq r3, I(2) /* r3 = D. */ \
__asm paddsw r7, r7 /* r7 = G + G */ \
__asm movq I(2), r2 /* store NR2 at I2 */ \
__asm paddsw r7, r4 /* r7 = G. = E + G */ \
__asm movq I(1), r1 /* store NR1 at I1 */ \
__asm psubsw r4, r3 /* r4 = R4 = E. - D. */ \
__asm paddsw r4, Eight /* adjust R4 (and R3) for shift */ \
__asm paddsw r3, r3 /* r3 = D. + D. */ \
__asm paddsw r3, r4 /* r3 = R3 = E. + D. */ \
__asm psraw r4, 4 /* r4 = NR4 */ \
__asm psubsw r6, r5 /* r6 = R6 = F. - B.. */ \
__asm psraw r3, 4 /* r3 = NR3 */ \
__asm paddsw r6, Eight /* adjust R6 (and R5) for shift */ \
__asm paddsw r5, r5 /* r5 = B.. + B.. */ \
__asm paddsw r5, r6 /* r5 = R5 = F. + B.. */ \
__asm psraw r6, 4 /* r6 = NR6 */ \
__asm movq J(4), r4 /* store NR4 at J4 */ \
__asm psraw r5, 4 /* r5 = NR5 */ \
__asm movq I(3), r3 /* store NR3 at I3 */ \
__asm psubsw r7, r0 /* r7 = R7 = G. - C. */ \
__asm paddsw r7, Eight /* adjust R7 (and R0) for shift */ \
__asm paddsw r0, r0 /* r0 = C. + C. */ \
__asm paddsw r0, r7 /* r0 = R0 = G. + C. */ \
__asm psraw r7, 4 /* r7 = NR7 */ \
__asm movq J(6), r6 /* store NR6 at J6 */ \
__asm psraw r0, 4 /* r0 = NR0 */ \
__asm movq J(5), r5 /* store NR5 at J5 */ \
\
__asm movq J(7), r7 /* store NR7 at J7 */ \
\
__asm movq I(0), r0 /* store NR0 at I0 */ \
\
}
  958. // end ColumnIDCT macro (38 + 19 = 57 cycles)
  959. /* --------------------------------------------------------------- */
  960. /* --------------------------------------------------------------- */
  961. /* IDCT 10 */
__declspec ( naked ) void MMX_idct10 ( INT16 * input, INT16 * qtbl, INT16 * output)
{
/*
 * Reduced-work IDCT for blocks whose first 10 zigzag coefficients are the
 * only ones that can be nonzero.  The dequant + descramble pass below still
 * touches all 64 coefficients; the savings come from the transform itself:
 * one RowIDCT_10 pass (rows 4..7 are zero after descramble, so the second
 * row pass is skipped) followed by two ColumnIDCT_10 passes.
 * NOTE(review): naked function -- no compiler prologue; Arg() offsets
 * account for the return address plus the three pushes below.
 */
# define M(I) [ecx + MaskOffset + I*8]
# define C(I) [ecx + CosineOffset + (I-1)*8]
# define Eight [ecx + EightOffset]
# undef Arg
# define Arg(I) [esp + 16 + I*4]
# define r0 mm0
# define r1 mm1
# define r2 mm2
# define r3 mm3
# define r4 mm4
# define r5 mm5
# define r6 mm6
# define r7 mm7
(void) output;
(void) qtbl;
(void) input;
__asm {
push edx
push ecx
push ebx
// Label:
mov eax, Arg( 0) ; eax = quantized input
mov edx, Arg( 2) ; edx = destination (= idct buffer)
mov ecx, [edx] ; (+1 at least) preload the cache before writing
mov ebx, [edx+28] ; in case proc doesn't cache on writes
mov ecx, [edx+56] ; gets all the cache lines
mov ebx, [edx+84] ; regardless of alignment (beyond 32-bit)
mov ecx, [edx+112] ; also avoids address contention stalls
mov ebx, [edx+124]
mov ebx, Arg( 1) ; ebx = quantization table
lea ecx, idctconstants ;; [0];
; Dequantize (coeff * qtbl) while descrambling from transposed zigzag
; order into a partially transposed 8x8 layout (rows R0..R15 below are
; 4-sample half-rows of the target buffer).
movq r0, [eax]
;
pmullw r0, [ebx] ; r0 = 03 02 01 00
;
movq r1, [eax+16]
;
pmullw r1, [ebx+16] ; r1 = 13 12 11 10
;
movq r2, M(0) ; r2 = __ __ __ FF
movq r3, r0 ; r3 = 03 02 01 00
movq r4, [eax+8]
psrlq r0, 16 ; r0 = __ 03 02 01
pmullw r4, [ebx+8] ; r4 = 07 06 05 04
pand r3, r2 ; r3 = __ __ __ 00
movq r5, r0 ; r5 = __ 03 02 01
movq r6, r1 ; r6 = 13 12 11 10
pand r5, r2 ; r5 = __ __ __ 01
psllq r6, 32 ; r6 = 11 10 __ __
movq r7, M(3) ; r7 = FF __ __ __
pxor r0, r5 ; r0 = __ 03 02 __
pand r7, r6 ; r7 = 11 __ __ __
por r0, r3 ; r0 = __ 03 02 00
pxor r6, r7 ; r6 = __ 10 __ __
por r0, r7 ; r0 = 11 03 02 00 = R0
movq r7, M(3) ; r7 = FF __ __ __
movq r3, r4 ; r3 = 07 06 05 04
movq [edx], r0 ; write R0 = r0
pand r3, r2 ; r3 = __ __ __ 04
movq r0, [eax+32]
psllq r3, 16 ; r3 = __ __ 04 __
pmullw r0, [ebx+32] ; r0 = 23 22 21 20
pand r7, r1 ; r7 = 13 __ __ __
por r5, r3 ; r5 = __ __ 04 01
por r7, r6 ; r7 = 13 10 __ __
movq r3, [eax+24]
por r7, r5 ; r7 = 13 10 04 01 = R1
pmullw r3, [ebx+24] ; r3 = 17 16 15 14
psrlq r4, 16 ; r4 = __ 07 06 05
movq [edx+16], r7 ; write R1 = r7
movq r5, r4 ; r5 = __ 07 06 05
movq r7, r0 ; r7 = 23 22 21 20
psrlq r4, 16 ; r4 = __ __ 07 06
psrlq r7, 48 ; r7 = __ __ __ 23
movq r6, r2 ; r6 = __ __ __ FF
pand r5, r2 ; r5 = __ __ __ 05
pand r6, r4 ; r6 = __ __ __ 06
movq [edx+80], r7 ; partial R9 = __ __ __ 23
pxor r4, r6 ; r4 = __ __ 07 __
psrlq r1, 32 ; r1 = __ __ 13 12
por r4, r5 ; r4 = __ __ 07 05
movq r7, M(3) ; r7 = FF __ __ __
pand r1, r2 ; r1 = __ __ __ 12
movq r5, [eax+48]
psllq r0, 16 ; r0 = 22 21 20 __
pmullw r5, [ebx+48] ; r5 = 33 32 31 30
pand r7, r0 ; r7 = 22 __ __ __
movq [edx+64], r1 ; partial R8 = __ __ __ 12
por r7, r4 ; r7 = 22 __ 07 05
movq r4, r3 ; r4 = 17 16 15 14
pand r3, r2 ; r3 = __ __ __ 14
movq r1, M(2) ; r1 = __ FF __ __
psllq r3, 32 ; r3 = __ 14 __ __
por r7, r3 ; r7 = 22 14 07 05 = R2
movq r3, r5 ; r3 = 33 32 31 30
psllq r3, 48 ; r3 = 30 __ __ __
pand r1, r0 ; r1 = __ 21 __ __
movq [edx+32], r7 ; write R2 = r7
por r6, r3 ; r6 = 30 __ __ 06
movq r7, M(1) ; r7 = __ __ FF __
por r6, r1 ; r6 = 30 21 __ 06
movq r1, [eax+56]
pand r7, r4 ; r7 = __ __ 15 __
pmullw r1, [ebx+56] ; r1 = 37 36 35 34
por r7, r6 ; r7 = 30 21 15 06 = R3
pand r0, M(1) ; r0 = __ __ 20 __
psrlq r4, 32 ; r4 = __ __ 17 16
movq [edx+48], r7 ; write R3 = r7
movq r6, r4 ; r6 = __ __ 17 16
movq r7, M(3) ; r7 = FF __ __ __
pand r4, r2 ; r4 = __ __ __ 16
movq r3, M(1) ; r3 = __ __ FF __
pand r7, r1 ; r7 = 37 __ __ __
pand r3, r5 ; r3 = __ __ 31 __
por r0, r4 ; r0 = __ __ 20 16
psllq r3, 16 ; r3 = __ 31 __ __
por r7, r0 ; r7 = 37 __ 20 16
movq r4, M(2) ; r4 = __ FF __ __
por r7, r3 ; r7 = 37 31 20 16 = R4
movq r0, [eax+80]
movq r3, r4 ; r3 = __ FF __ __ (copy of the M(2) mask)
pmullw r0, [ebx+80] ; r0 = 53 52 51 50
pand r4, r5 ; r4 = __ 32 __ __
movq [edx+8], r7 ; write R4 = r7
por r6, r4 ; r6 = __ 32 17 16
movq r4, r3 ; r4 = __ FF __ __
psrlq r6, 16 ; r6 = __ __ 32 17
movq r7, r0 ; r7 = 53 52 51 50
pand r4, r1 ; r4 = __ 36 __ __
psllq r7, 48 ; r7 = 50 __ __ __
por r6, r4 ; r6 = __ 36 32 17
movq r4, [eax+88]
por r7, r6 ; r7 = 50 36 32 17 = R5
pmullw r4, [ebx+88] ; r4 = 57 56 55 54
psrlq r3, 16 ; r3 = __ __ FF __
movq [edx+24], r7 ; write R5 = r7
pand r3, r1 ; r3 = __ __ 35 __
psrlq r5, 48 ; r5 = __ __ __ 33
pand r1, r2 ; r1 = __ __ __ 34
movq r6, [eax+104]
por r5, r3 ; r5 = __ __ 35 33
pmullw r6, [ebx+104] ; r6 = 67 66 65 64
psrlq r0, 16 ; r0 = __ 53 52 51
movq r7, r4 ; r7 = 57 56 55 54
movq r3, r2 ; r3 = __ __ __ FF
psllq r7, 48 ; r7 = 54 __ __ __
pand r3, r0 ; r3 = __ __ __ 51
pxor r0, r3 ; r0 = __ 53 52 __
psllq r3, 32 ; r3 = __ 51 __ __
por r7, r5 ; r7 = 54 __ 35 33
movq r5, r6 ; r5 = 67 66 65 64
pand r6, M(1) ; r6 = __ __ 65 __
por r7, r3 ; r7 = 54 51 35 33 = R6
psllq r6, 32 ; r6 = 65 __ __ __
por r0, r1 ; r0 = __ 53 52 34
movq [edx+40], r7 ; write R6 = r7
por r0, r6 ; r0 = 65 53 52 34 = R7
movq r7, [eax+120]
movq r6, r5 ; r6 = 67 66 65 64
pmullw r7, [ebx+120] ; r7 = 77 76 75 74
psrlq r5, 32 ; r5 = __ __ 67 66
pand r6, r2 ; r6 = __ __ __ 64
movq r1, r5 ; r1 = __ __ 67 66
movq [edx+56], r0 ; write R7 = r0
pand r1, r2 ; r1 = __ __ __ 66
movq r0, [eax+112]
movq r3, r7 ; r3 = 77 76 75 74
pmullw r0, [ebx+112] ; r0 = 73 72 71 70
psllq r3, 16 ; r3 = 76 75 74 __
pand r7, M(3) ; r7 = 77 __ __ __
pxor r5, r1 ; r5 = __ __ 67 __
por r6, r5 ; r6 = __ __ 67 64
movq r5, r3 ; r5 = 76 75 74 __
pand r5, M(3) ; r5 = 76 __ __ __
por r7, r1 ; r7 = 77 __ __ 66
movq r1, [eax+96]
pxor r3, r5 ; r3 = __ 75 74 __
pmullw r1, [ebx+96] ; r1 = 63 62 61 60
por r7, r3 ; r7 = 77 75 74 66 = R15
por r6, r5 ; r6 = 76 __ 67 64
movq r5, r0 ; r5 = 73 72 71 70
movq [edx+120], r7 ; store R15 = r7
psrlq r5, 16 ; r5 = __ 73 72 71
pand r5, M(2) ; r5 = __ 73 __ __
movq r7, r0 ; r7 = 73 72 71 70
por r6, r5 ; r6 = 76 73 67 64 = R14
pand r0, r2 ; r0 = __ __ __ 70
pxor r7, r0 ; r7 = 73 72 71 __
psllq r0, 32 ; r0 = __ 70 __ __
movq [edx+104], r6 ; write R14 = r6
psrlq r4, 16 ; r4 = __ 57 56 55
movq r5, [eax+72]
psllq r7, 16 ; r7 = 72 71 __ __
pmullw r5, [ebx+72] ; r5 = 47 46 45 44
movq r6, r7 ; r6 = 72 71 __ __
movq r3, M(2) ; r3 = __ FF __ __
psllq r6, 16 ; r6 = 71 __ __ __
pand r7, M(3) ; r7 = 72 __ __ __
pand r3, r1 ; r3 = __ 62 __ __
por r7, r0 ; r7 = 72 70 __ __
movq r0, r1 ; r0 = 63 62 61 60
pand r1, M(3) ; r1 = 63 __ __ __
por r6, r3 ; r6 = 71 62 __ __
movq r3, r4 ; r3 = __ 57 56 55
psrlq r1, 32 ; r1 = __ __ 63 __
pand r3, r2 ; r3 = __ __ __ 55
por r7, r1 ; r7 = 72 70 63 __
por r7, r3 ; r7 = 72 70 63 55 = R13
movq r3, r4 ; r3 = __ 57 56 55
pand r3, M(1) ; r3 = __ __ 56 __
movq r1, r5 ; r1 = 47 46 45 44
movq [edx+88], r7 ; write R13 = r7
psrlq r5, 48 ; r5 = __ __ __ 47
movq r7, [eax+64]
por r6, r3 ; r6 = 71 62 56 __
pmullw r7, [ebx+64] ; r7 = 43 42 41 40
por r6, r5 ; r6 = 71 62 56 47 = R12
pand r4, M(2) ; r4 = __ 57 __ __
psllq r0, 32 ; r0 = 61 60 __ __
movq [edx+72], r6 ; write R12 = r6
movq r6, r0 ; r6 = 61 60 __ __
pand r0, M(3) ; r0 = 61 __ __ __
psllq r6, 16 ; r6 = 60 __ __ __
movq r5, [eax+40]
movq r3, r1 ; r3 = 47 46 45 44
pmullw r5, [ebx+40] ; r5 = 27 26 25 24
psrlq r1, 16 ; r1 = __ 47 46 45
pand r1, M(1) ; r1 = __ __ 46 __
por r0, r4 ; r0 = 61 57 __ __
pand r2, r7 ; r2 = __ __ __ 40
por r0, r1 ; r0 = 61 57 46 __
por r0, r2 ; r0 = 61 57 46 40 = R11
psllq r3, 16 ; r3 = 46 45 44 __
movq r4, r3 ; r4 = 46 45 44 __
movq r2, r5 ; r2 = 27 26 25 24
movq [edx+112], r0 ; write R11 = r0
psrlq r2, 48 ; r2 = __ __ __ 27
pand r4, M(2) ; r4 = __ 45 __ __
por r6, r2 ; r6 = 60 __ __ 27
movq r2, M(1) ; r2 = __ __ FF __
por r6, r4 ; r6 = 60 45 __ 27
pand r2, r7 ; r2 = __ __ 41 __
psllq r3, 32 ; r3 = 44 __ __ __
por r3, [edx+80] ; r3 = 44 __ __ 23
por r6, r2 ; r6 = 60 45 41 27 = R10
movq r2, M(3) ; r2 = FF __ __ __
psllq r5, 16 ; r5 = 26 25 24 __
movq [edx+96], r6 ; store R10 = r6
pand r2, r5 ; r2 = 26 __ __ __
movq r6, M(2) ; r6 = __ FF __ __
pxor r5, r2 ; r5 = __ 25 24 __
pand r6, r7 ; r6 = __ 42 __ __
psrlq r2, 32 ; r2 = __ __ 26 __
pand r7, M(3) ; r7 = 43 __ __ __
por r3, r2 ; r3 = 44 __ 26 23
por r7, [edx+64] ; r7 = 43 __ __ 12
por r6, r3 ; r6 = 44 42 26 23 = R9
por r7, r5 ; r7 = 43 25 24 12 = R8
;
movq [edx+80], r6 ; store R9 = r6
;
movq [edx+64], r7 ; store R8 = r7
;
; 123c ( / 64 coeffs < 2c / coeff)
# undef M
; Done w/dequant + descramble + partial transpose; now do the idct itself.
# define I( K) [edx + ( K * 16)]
# define J( K) [edx + ( (K - 4) * 16) + 8]
RowIDCT_10 ; 33 c
Transpose ; 19 c
# undef I
# undef J
# define I( K) [edx + ( K * 16) + 64]
# define J( K) [edx + ( (K - 4) * 16) + 72]
; second row pass is unnecessary: rows 4..7 hold only zero coefficients
// RowIDCT ; 46 c
// Transpose ; 19 c
# undef I
# undef J
# define I( K) [edx + (K * 16)]
# define J( K) I( K)
ColumnIDCT_10 ; 44 c
# undef I
# undef J
# define I( K) [edx + (K * 16) + 8]
# define J( K) I( K)
ColumnIDCT_10 ; 44 c
# undef I
# undef J
pop ebx
pop ecx
pop edx
ret
}
}
  1258. /**************************************************************************************
  1259. *
  1260. * Routine: MMX_idct1
  1261. *
* Description: Perform IDCT on an 8x8 block with at most 1 nonzero coefficient
  1263. *
  1264. * Input: Pointer to input and output buffer
  1265. *
  1266. * Output: None
  1267. *
  1268. * Return: None
  1269. *
  1270. * Special Note: None
  1271. *
  1272. * Error: None
  1273. *
  1274. ***************************************************************************************
  1275. */
  1276. /* --------------------------------------------------------------- */
  1277. /* IDCT 1 */
  1278. void MMX_idct1 (INT16 * input, INT16 * qtbl, INT16 * output)
  1279. {
  1280. if(input[0])
  1281. {
  1282. int i;
  1283. INT32 temp = (INT32)input[0];
  1284. INT32 *iBuf=(INT32 *)output;
  1285. temp *= qtbl[0];
  1286. //necessary in order to match tim's
  1287. temp += 15;
  1288. temp >>= 5;
  1289. temp &= 0xffff;
  1290. temp += temp << 16;
  1291. for(i = 0; i < 32; i += 4)
  1292. {
  1293. iBuf[i] = temp;
  1294. iBuf[i+1] = temp;
  1295. iBuf[i+2] = temp;
  1296. iBuf[i+3] = temp;
  1297. }
  1298. }
  1299. else
  1300. {
  1301. /* special case where there is only a 0 dc coeff */
  1302. memset( output, 0, 128);
  1303. }
  1304. }
  1305. /* --------------------------------------------------------------- */
  1306. /*
  1307. The following functions (MMX_idct_DX and MMX_idct10_DX) are only
  1308. used by the dxer. The coeffs are written into a transposed order
  1309. during the unpack stage.
  1310. */
  1311. /* --------------------------------------------------------------- */
  1312. /**************************************************************************************
  1313. *
  1314. * Routine: MMX_idct_DX
  1315. *
* Description: Perform IDCT on an 8x8 block
  1317. *
  1318. * Input: Pointer to input and output buffer
  1319. *
  1320. * Output: None
  1321. *
  1322. * Return: None
  1323. *
  1324. * Special Note: The input coefficients are in transposed ZigZag order
  1325. *
  1326. * Error: None
  1327. *
  1328. ***************************************************************************************
  1329. */
__declspec ( naked ) void MMX_idct_DX ( INT16 * input, INT16 * qtbl, INT16 * output)
{
/*
 * DX-path full IDCT: the unpack stage already wrote the coefficients in
 * transposed order, so no zigzag descramble is needed here -- just a
 * straight dequantize (coeff * qtbl, one quadword at a time) into the
 * idct buffer, then the regular RowIDCT/Transpose/ColumnIDCT passes.
 * NOTE(review): naked function -- no compiler prologue; Arg() offsets
 * account for the return address plus the three pushes below.
 */
// uINT16 *constants = idctconstants;
# define M(I) [ecx + MaskOffset + I*8]
# define C(I) [ecx + CosineOffset + (I-1)*8]
# define Eight [ecx + EightOffset]
# undef Arg
# define Arg(I) [esp + 1*4 + 3*4 + I*4] // 1 return address + 3 pushes prior to args
# define r0 mm0
# define r1 mm1
# define r2 mm2
# define r3 mm3
# define r4 mm4
# define r5 mm5
# define r6 mm6
# define r7 mm7
(void) output;
(void) qtbl;
(void) input;
__asm {
push edx
push ecx
push ebx
;; Label:
mov eax, Arg( 0) ; eax = quantized input
mov edx, Arg( 2) ; edx = destination (= idct buffer)
mov ecx, [edx] ; (+1 at least) preload the cache before writing
mov ebx, [edx+28] ; in case proc doesn't cache on writes
mov ecx, [edx+56] ; gets all the cache lines
mov ebx, [edx+84] ; regardless of alignment (beyond 32-bit)
mov ecx, [edx+112] ; also avoids address contention stalls
mov ebx, [edx+124]
mov ebx, Arg( 1) ; ebx = quantization table
lea ecx, idctconstants ;;[0];
//dequantization
//try to optimize better
; first half of the block: rows 0..3 (8 INT16 per quadword pair)
movq r0, [eax+0]
;
pmullw r0, [ebx+0] ; r0 = 03 02 01 00
;
movq r1, [eax+8]
;
pmullw r1, [ebx+8]
;
movq r2, [eax+16]
;
pmullw r2, [ebx+16]
;
movq r3, [eax+24]
;
pmullw r3, [ebx+24]
;
movq r4, [eax+32]
;
pmullw r4, [ebx+32]
;
movq r5, [eax+40]
;
pmullw r5, [ebx+40]
;
movq r6, [eax+48]
;
pmullw r6, [ebx+48]
;
movq r7, [eax+56]
;
pmullw r7, [ebx+56]
;
movq [edx+0],r0
;
movq [edx+8],r1
;
movq [edx+16],r2
;
movq [edx+24],r3
;
movq [edx+32],r4
;
movq [edx+40],r5
;
movq [edx+48],r6
;
movq [edx+56],r7
;
;;;;;;;;;;;
; second half of the block: rows 4..7
movq r0, [eax+64]
;
pmullw r0, [ebx+64] ; r0 = 03 02 01 00
;
movq r1, [eax+72]
;
pmullw r1, [ebx+72]
;
movq r2, [eax+80]
;
pmullw r2, [ebx+80]
;
movq r3, [eax+88]
;
pmullw r3, [ebx+88]
;
movq r4, [eax+96]
;
pmullw r4, [ebx+96]
;
movq r5, [eax+104]
;
pmullw r5, [ebx+104]
;
movq r6, [eax+112]
;
pmullw r6, [ebx+112]
;
movq r7, [eax+120]
;
pmullw r7, [ebx+120]
;
movq [edx+64],r0
;
movq [edx+72],r1
;
movq [edx+80],r2
;
movq [edx+88],r3
;
movq [edx+96],r4
;
movq [edx+104],r5
;
movq [edx+112],r6
;
movq [edx+120],r7
;
# undef M
; Done w/dequant + descramble + partial transpose; now do the idct itself.
# define I( K) [edx + ( K * 16)]
# define J( K) [edx + ( (K - 4) * 16) + 8]
RowIDCT ; 46 c
Transpose ; 19 c
# undef I
# undef J
# define I( K) [edx + ( K * 16) + 64]
# define J( K) [edx + ( (K - 4) * 16) + 72]
RowIDCT ; 46 c
Transpose ; 19 c
# undef I
# undef J
# define I( K) [edx + (K * 16)]
# define J( K) I( K)
ColumnIDCT ; 57 c
# undef I
# undef J
# define I( K) [edx + (K * 16) + 8]
# define J( K) I( K)
ColumnIDCT ; 57 c
# undef I
# undef J
pop ebx
pop ecx
pop edx
ret
; 368 cycles ( / 64 coeff < 6 c / coeff)
}
}
  1494. /**************************************************************************************
  1495. *
  1496. * Routine: MMX_idct10_DX
  1497. *
* Description: Perform IDCT on an 8x8 block with at most 10 nonzero coefficients
  1499. *
  1500. * Input: Pointer to input and output buffer
  1501. *
  1502. * Output: None
  1503. *
  1504. * Return: None
  1505. *
  1506. * Special Note: The input coefficients are in transposed ZigZag order
  1507. *
  1508. * Error: None
  1509. *
  1510. ***************************************************************************************
  1511. */
  1512. /* --------------------------------------------------------------- */
  1513. /* IDCT 10 */
__declspec ( naked ) void MMX_idct10_DX ( INT16 * input, INT16 * qtbl, INT16 * output)
{
# define M(I) [ecx + MaskOffset + I*8]
# define C(I) [ecx + CosineOffset + (I-1)*8]
# define Eight [ecx + EightOffset]
# undef Arg
# define Arg(I) [esp + 16 + I*4]
# define r0 mm0
# define r1 mm1
# define r2 mm2
# define r3 mm3
# define r4 mm4
# define r5 mm5
# define r6 mm6
# define r7 mm7
(void) output;
(void) qtbl;
(void) input;
__asm {
push edx
push ecx
push ebx
// Label:
mov eax, Arg( 0) ; eax = quantized input
mov edx, Arg( 2) ; edx = destination (= idct buffer)
mov ecx, [edx] ; (+1 at least) preload the cache before writing
mov ebx, [edx+28] ; in case proc doesn't cache on writes
mov ecx, [edx+56] ; gets all the cache lines
mov ebx, [edx+84] ; regardless of alignment (beyond 32-bit)
mov ecx, [edx+112] ; also avoids address contention stalls
mov ebx, [edx+124]
mov ebx, Arg( 1) ; ebx = quantization table
lea ecx, idctconstants ;; [0];
//dequantization -- with at most 10 nonzero coefficients (transposed zigzag
//order), only the first four 8-byte rows can carry data; the rest is cleared
movq r0, [eax+0]
;
pmullw r0, [ebx+0] ; r0 = dequantized row 0, words 0..3
;
movq r1, [eax+16]
;
pmullw r1, [ebx+16] ; r1 = dequantized row 1, words 0..3
;
movq r2, [eax+32]
;
pmullw r2, [ebx+32] ; r2 = dequantized row 2, words 0..3
;
movq r3, [eax+48]
;
pmullw r3, [ebx+48] ; r3 = dequantized row 3, words 0..3
;
movq [edx+0],r0
pxor r5,r5 ; r5 = 0, used to clear all remaining quadrants
movq [edx+8],r5
;
movq [edx+16],r1
;
movq [edx+24],r5
;
movq [edx+32],r2
;
movq [edx+40],r5
;
movq [edx+48],r3
;
movq [edx+56],r5
;
movq [edx+64],r5
;
movq [edx+72],r5
;
movq [edx+80],r5
;
movq [edx+88],r5
;
movq [edx+96],r5
;
movq [edx+104],r5
;
movq [edx+112],r5
;
movq [edx+120],r5
;
# undef M
; Done w/dequant + descramble + partial transpose; now do the idct itself.
# define I( K) [edx + ( K * 16)]
# define J( K) [edx + ( (K - 4) * 16) + 8]
RowIDCT_10 ; 33 c
Transpose ; 19 c
# undef I
# undef J
# define I( K) [edx + ( K * 16) + 64]
# define J( K) [edx + ( (K - 4) * 16) + 72]
// RowIDCT ; 46 c -- rows 4..7 are all zero, so the second row pass is skipped
// Transpose ; 19 c
# undef I
# undef J
# define I( K) [edx + (K * 16)]
# define J( K) I( K)
ColumnIDCT_10 ; 44 c
# undef I
# undef J
# define I( K) [edx + (K * 16) + 8]
# define J( K) I( K)
ColumnIDCT_10 ; 44 c
# undef I
# undef J
pop ebx
pop ecx
pop edx
ret
}
}
  1626. /**************************************************************************************
  1627. *
  1628. * Routine: MMX_idct3
  1629. *
* Description: Perform IDCT on an 8x8 block with at most 3 nonzero coefficients
  1631. *
  1632. * Input: Pointer to input and output buffer
  1633. *
  1634. * Output: None
  1635. *
  1636. * Return: None
  1637. *
  1638. * Special Note: Only works for three nonzero coefficients.
  1639. *
  1640. * Error: None
  1641. *
  1642. ***************************************************************************************
  1643. */
  1644. /***************************************************************************************
  1645. In IDCT 3, we are dealing with only three Non-Zero coefficients in the 8x8 block.
  1646. In the case that we work in the fashion RowIDCT -> ColumnIDCT, we only have to
  1647. do 1-D row idcts on the first two rows, the rest six rows remain zero anyway.
After row IDCTs, since every column could have nonzero coefficients, we need to do
eight 1-D column IDCTs. However, for each column, there are at most two nonzero
coefficients, coefficient 0 and coefficient 1. The same holds for the coefficients
of the two 1-D row IDCTs. For this reason, the process of a 1-D IDCT is simplified
  1652. from a full version:
  1653. A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
  1654. C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
  1655. A. = C4 * (A - C) B. = C4 * (B - D)
  1656. C. = A + C D. = B + D
  1657. E = C4 * (I0 + I4) F = C4 * (I0 - I4)
  1658. G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
  1659. E. = E - G
  1660. G. = E + G
  1661. A.. = F + A. B.. = B. - H
  1662. F. = F - A. H. = B. + H
  1663. R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
  1664. R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
  1665. To:
  1666. A = (C1 * I1) B = (C7 * I1)
  1667. C = 0 D = 0
  1668. A. = C4 * A B. = C4 * B
  1669. C. = A D. = B
  1670. E = C4 * I0 F = E
  1671. G = 0 H = 0
  1672. E. = E
  1673. G. = E
  1674. A.. = E + A. B.. = B.
  1675. F. = E - A. H. = B.
  1676. R0 = E + A R1 = E + A. + B. R3 = E + B R5 = E - A. + B.
  1677. R7 = E - A R2 = E + A. - B. R4 = E - B R6 = F - A. - B.
  1678. ******************************************************************************************/
#define RowIDCT_3 __asm { \
\
/* 1-D row IDCT specialized for only I0 and I1 nonzero (see derivation above) */ \
__asm movq r7, I(1) /* r7 = I1 */ \
__asm movq r0, C(1) /* r0 = C1 */ \
\
__asm movq r3, C(7) /* r3 = C7 */ \
__asm pmulhw r0, r7 /* r0 = C1 * I1 - I1 */ \
\
__asm pmulhw r3, r7 /* r3 = C7 * I1 = B, D. */ \
__asm movq r6, I(0) /* r6 = I0 */ \
\
__asm movq r4, C(4) /* r4 = C4 */ \
__asm paddw r0, r7 /* r0 = C1 * I1 = A, C. */ \
\
__asm movq r1, r6 /* make a copy of I0 */ \
__asm pmulhw r6, r4 /* r6 = C4 * I0 - I0 */ \
\
__asm movq r2, r0 /* make a copy of A */ \
__asm movq r5, r3 /* make a copy of B */ \
\
__asm pmulhw r2, r4 /* r2 = C4 * A - A */ \
__asm pmulhw r5, r4 /* r5 = C4 * B - B */ \
\
__asm paddw r6, r1 /* r6 = C4 * I0 = E, F */ \
__asm movq r4, r6 /* r4 = E */ \
\
__asm paddw r2, r0 /* r2 = A. */ \
__asm paddw r5, r3 /* r5 = B. */ \
\
__asm movq r7, r6 /* r7 = E */ \
__asm movq r1, r5 /* r1 = B. */ \
\
/* r0 = A */ \
/* r3 = B */ \
/* r2 = A. */ \
/* r5 = B. */ \
/* r6 = E */ \
/* r4 = E */ \
/* r7 = E */ \
/* r1 = B. */ \
\
__asm psubw r6, r2 /* r6 = E - A. */ \
__asm psubw r4, r3 /* r4 = E - B ----R4 */ \
\
__asm psubw r7, r0 /* r7 = E - A ----R7 */ \
__asm paddw r2, r2 /* r2 = A. + A. */ \
\
__asm paddw r3, r3 /* r3 = B + B */ \
__asm paddw r0, r0 /* r0 = A + A */ \
\
__asm paddw r2, r6 /* r2 = E + A. */ \
__asm paddw r3, r4 /* r3 = E + B ----R3 */ \
\
__asm psubw r2, r1 /* r2 = E + A. - B. ----R2 */ \
__asm psubw r6, r5 /* r6 = E - A. - B. ----R6 */ \
\
__asm paddw r1, r1 /* r1 = B. + B. */ \
__asm paddw r5, r5 /* r5 = B. + B. */ \
\
__asm paddw r0, r7 /* r0 = E + A ----R0 */ \
__asm paddw r1, r2 /* r1 = E + A. + B. -----R1 */ \
\
__asm movq I(1), r1 /* save r1 */ \
__asm paddw r5, r6 /* r5 = E - A. + B. -----R5 */ \
\
}
  1746. #define ColumnIDCT_3 __asm { \
  1747. \
  1748. __asm movq r7, I(1) /* r7 = I1 */ \
  1749. __asm movq r0, C(1) /* r0 = C1 */ \
  1750. \
  1751. __asm movq r3, C(7) /* r3 = C7 */ \
  1752. __asm pmulhw r0, r7 /* r0 = C1 * I1 - I1 */ \
  1753. \
  1754. __asm pmulhw r3, r7 /* r3 = C7 * I1 = B, D. */ \
  1755. __asm movq r6, I(0) /* r6 = I0 */ \
  1756. \
  1757. __asm movq r4, C(4) /* r4 = C4 */ \
  1758. __asm paddw r0, r7 /* r0 = C1 * I1 = A, C. */ \
  1759. \
  1760. __asm movq r1, r6 /* make a copy of I0 */ \
  1761. __asm pmulhw r6, r4 /* r2 = C4 * I0 - I0 */ \
  1762. \
  1763. __asm movq r2, r0 /* make a copy of A */ \
  1764. __asm movq r5, r3 /* make a copy of B */ \
  1765. \
  1766. __asm pmulhw r2, r4 /* r2 = C4 * A - A */ \
  1767. __asm pmulhw r5, r4 /* r5 = C4 * B - B */ \
  1768. \
  1769. __asm paddw r6, r1 /* r2 = C4 * I0 = E, F */ \
  1770. __asm movq r4, r6 /* r4 = E */ \
  1771. \
  1772. __asm paddw r6, Eight /* +8 for shift */ \
  1773. __asm Paddw r4, Eight /* +8 for shift */ \
  1774. \
  1775. __asm paddw r2, r0 /* r2 = A. */ \
  1776. __asm paddw r5, r3 /* r5 = B. */ \
  1777. \
  1778. __asm movq r7, r6 /* r7 = E */ \
  1779. __asm movq r1, r5 /* r1 = B. */ \
  1780. \
  1781. /* r0 = A */ \
  1782. /* r3 = B */ \
  1783. /* r2 = A. */ \
  1784. /* r5 = B. */ \
  1785. /* r6 = E */ \
  1786. /* r4 = E */ \
  1787. /* r7 = E */ \
  1788. /* r1 = B. */ \
  1789. \
  1790. __asm psubw r6, r2 /* r6 = E - A. */ \
  1791. __asm psubw r4, r3 /* r4 = E - B ----R4 */ \
  1792. \
  1793. __asm psubw r7, r0 /* r7 = E - A ----R7 */ \
  1794. __asm paddw r2, r2 /* r2 = A. + A. */ \
  1795. \
  1796. __asm paddw r3, r3 /* r3 = B + B */ \
  1797. __asm paddw r0, r0 /* r0 = A + A */ \
  1798. \
  1799. __asm paddw r2, r6 /* r2 = E + A. */ \
  1800. __asm paddw r3, r4 /* r3 = E + B ----R3 */ \
  1801. \
  1802. __asm psraw r4, 4 /* shift */ \
  1803. __asm movq J(4), r4 /* store R4 at J4 */ \
  1804. \
  1805. __asm psraw r3, 4 /* shift */ \
  1806. __asm movq I(3), r3 /* store R3 at I3 */ \
  1807. \
  1808. __asm psubw r2, r1 /* r2 = E + A. - B. ----R2 */ \
  1809. __asm psubw r6, r5 /* r6 = E - A. - B. ----R6 */ \
  1810. \
  1811. __asm paddw r1, r1 /* r1 = B. + B. */ \
  1812. __asm paddw r5, r5 /* r5 = B. + B. */ \
  1813. \
  1814. __asm paddw r0, r7 /* r0 = E + A ----R0 */ \
  1815. __asm paddw r1, r2 /* r1 = E + A. + B. -----R1 */ \
  1816. \
  1817. __asm psraw r7, 4 /* shift */ \
  1818. __asm psraw r2, 4 /* shift */ \
  1819. \
  1820. __asm psraw r0, 4 /* shift */ \
  1821. __asm psraw r1, 4 /* shift */ \
  1822. \
  1823. __asm movq J(7), r7 /* store R7 to J7 */ \
  1824. __asm movq I(0), r0 /* store R0 to I0 */ \
  1825. \
  1826. __asm movq I(1), r1 /* store R1 to I1 */ \
  1827. __asm movq I(2), r2 /* store R2 to I2 */ \
  1828. \
  1829. __asm movq I(1), r1 /* save r1 */ \
  1830. __asm paddw r5, r6 /* r5 = E - A. + B. -----R5 */ \
  1831. \
  1832. __asm psraw r5, 4 /* shift */ \
  1833. __asm movq J(5), r5 /* store R5 at J5 */ \
  1834. \
  1835. __asm psraw r6, 4 /* shift */ \
  1836. __asm movq J(6), r6 /* store R6 at J6 */ \
  1837. \
  1838. }
  1839. //End of ColumnIDCT_3
__declspec ( naked ) void MMX_idct3 ( INT16 * input, INT16 * output )
{
# define M(I) [ecx + MaskOffset + I*8]
# define C(I) [ecx + CosineOffset + (I-1)*8]
# define Eight [ecx + EightOffset]
# undef Arg
# define Arg(I) [esp + 16 + I*4]
# define r0 mm0
# define r1 mm1
# define r2 mm2
# define r3 mm3
# define r4 mm4
# define r5 mm5
# define r6 mm6
# define r7 mm7
(void) output;
(void) input;
__asm {
push edx
push ecx
push ebx
// Label:
mov eax, Arg( 0) ; eax = quantized input
mov edx, Arg( 1) ; edx = destination (= idct buffer)
mov ecx, [edx] ; (+1 at least) preload the cache before writing
mov ebx, [edx+28] ; in case proc doesn't cache on writes
mov ecx, [edx+56] ; gets all the cache lines
mov ebx, [edx+84] ; regardless of alignment (beyond 32-bit)
mov ecx, [edx+112] ; also avoids address contention stalls
mov ebx, [edx+124]
lea ecx, idctconstants ;; [0];
; NOTE(review): unlike the other variants there is no qtbl argument and no
; pmullw dequant pass -- presumably the <=3 coefficients arrive already
; dequantized; verify against the call sites.
movq r0, [eax] ; r0 = 03 02 01 00 (first four input words)
;
pxor r1, r1 ; r1 = 0 (second input row is known zero)
;
movq r2, M(0) ; r2 = __ __ __ FF (mask selecting word 0)
movq r3, r0 ; r3 = 03 02 01 00
pxor r4, r4 ; r4 = 0 (stands in for all the zero rows)
psrlq r0, 16 ; r0 = __ 03 02 01
pand r3, r2 ; r3 = __ __ __ 00
movq r5, r0 ; r5 = __ 03 02 01
movq r6, r1 ; r6 = 0 (copy of zeroed r1)
pand r5, r2 ; r5 = __ __ __ 01
;psllq r6, 32 ; r6 = 11 10 __ __
movq r7, M(3) ; r7 = FF __ __ __ (mask selecting word 3)
pxor r0, r5 ; r0 = __ 03 02 __
pand r7, r6 ; r7 = 0 (r6 is zero; dataflow kept from the idct10 variant)
por r0, r3 ; r0 = __ 03 02 00
pxor r6, r7 ; r6 = 0
por r0, r7 ; r0 = __ 03 02 00 = R0 (word 3 is zero)
movq r7, M(3) ; r7 = FF __ __ __
movq r3, r4 ; r3 = 0 (r4 is zero; was "07 06 05 04" in the idct10 variant)
movq [edx], r0 ; write R0 = r0
pand r3, r2 ; r3 = 0
psllq r3, 16 ; r3 = 0
pand r7, r1 ; r7 = 0 (r1 is zero)
por r5, r3 ; r5 = __ __ __ 01
por r7, r6 ; r7 = 0
por r7, r5 ; r7 = __ __ __ 01 = R1
psrlq r4, 16 ; r4 remains 0
movq [edx+16], r7 ; write R1 = r7
movq [edx+32], r4 ; write R2 = r4 (zero)
movq [edx+48], r4 ; write R3 = r4 (zero)
movq [edx+8], r4 ; write R4 = r4 (zero)
movq [edx+24], r4 ; write R5 = r4 (zero)
movq [edx+40], r4 ; write R6 = r4 (zero)
movq [edx+56], r4 ; write R7 = r4 (zero)
movq [edx+120], r4 ; store R15 = r4 (zero)
movq [edx+104], r4 ; write R14 = r4 (zero)
movq [edx+88], r4 ; write R13 = r4 (zero)
movq [edx+72], r4 ; write R12 = r4 (zero)
movq [edx+112], r4 ; write R11 = r4 (zero)
movq [edx+96], r4 ; store R10 = r4 (zero)
movq [edx+80], r4 ; store R9 = r4 (zero)
movq [edx+64], r4 ; store R8 = r4 (zero)
;
; 123c ( / 64 coeffs < 2c / coeff)
# undef M
; Done with partial transpose; now do the idct itself.
# define I( K) [edx + ( K * 16)]
# define J( K) [edx + ( (K - 4) * 16) + 8]
RowIDCT_3 ; 33 c
Transpose ; 19 c
# undef I
# undef J
# define I( K) [edx + ( K * 16) + 64]
# define J( K) [edx + ( (K - 4) * 16) + 72]
// RowIDCT ; 46 c -- rows 4..7 are all zero, so the second row pass is skipped
// Transpose ; 19 c
# undef I
# undef J
# define I( K) [edx + (K * 16)]
# define J( K) I( K)
ColumnIDCT_3 ; 44 c
# undef I
# undef J
# define I( K) [edx + (K * 16) + 8]
# define J( K) I( K)
ColumnIDCT_3 ; 44 c
# undef I
# undef J
pop ebx
pop ecx
pop edx
ret
}
}