
  1. /****************************************************************************
  2. *
  3. * Module Title : wmtidct.c
  4. *
  5. * Description : IDCT functions optimized specifically for the Willamette
  6. * processor
  7. *
  8. * Special Notes:
  9. *
  10. * AUTHOR : YaoWu Xu
  11. *
  12. *****************************************************************************
  13. * Revision History
  14. *
  15. * 1.02 YWX 07-dec-00 Removed code not in use and added push pop ebx
  16. * 1.01 YWX 29/06/00 Added Wmt_IDCT_Dx and Wmt_IDCT10_Dx
  17. * 1.00 YWX 31/05/00 Configuration baseline
  18. *
  19. *****************************************************************************
  20. */
  21. /*******************************************************************************
  22. * Module Constants
  23. *******************************************************************************
  24. */
  25. /* constants for rounding */
  26. __declspec(align(32)) static unsigned int Eight[]=
  27. {
  28. 0x00080008,
  29. 0x00080008,
  30. 0x00080008,
  31. 0x00080008
  32. };
  33. /* cosine constants, cosine ( i * pi / 8 ) */
  34. __declspec(align(32)) static unsigned short WmtIdctConst[7 * 8]=
  35. {
  36. 64277,64277,64277,64277,64277,64277,64277,64277,
  37. 60547,60547,60547,60547,60547,60547,60547,60547,
  38. 54491,54491,54491,54491,54491,54491,54491,54491,
  39. 46341,46341,46341,46341,46341,46341,46341,46341,
  40. 36410,36410,36410,36410,36410,36410,36410,36410,
  41. 25080,25080,25080,25080,25080,25080,25080,25080,
  42. 12785,12785,12785,12785,12785,12785,12785,12785
  43. };
  44. /* Mask constant for dequantization */
  45. __declspec(align(32)) static unsigned short WmtDequantConst[]=
  46. {
  47. 0,65535,65535,0,0,0,0,0, //0x0000 0000 0000 0000 0000 FFFF FFFF 0000
  48. 0,0,0,0,65535,65535,0,0, //0x0000 0000 FFFF FFFF 0000 0000 0000 0000
  49. 65535,65535,65535,0,0,0,0,0,//0x0000 0000 0000 0000 0000 FFFF FFFF FFFF
  50. 0,0,0,65535,0,0,0,0, //0x0000 0000 0000 0000 FFFF 0000 0000 0000
  51. 0,0,0,65535,65535,0,0,0, //0x0000 0000 0000 FFFF FFFF 0000 0000 0000
  52. 65535,0,0,0,0,65535,0,0, //0x0000 0000 FFFF 0000 0000 0000 0000 FFFF
  53. 0,0,65535,65535, 0,0,0,0 //0x0000 0000 0000 0000 FFFF FFFF 0000 0000
  54. };
  55. /*******************************************************************************
  56. * Forward Reference
  57. *******************************************************************************
  58. */
  59. /********************************************************************************
  60. * Description of Inverse DCT algorithm.
  61. ********************************************************************************
  62. *
  63. Dequantization multiplies the user's 16-bit signed indices (range -512 to +511)
  64. by unsigned 16-bit quantization table entries.
  65. These table entries are upscaled by 4; the maximum is 30 * 128 * 4 < 2^14.
  66. The result is scaled signed DCT coefficients (absolute value < 2^15).
  67. In the data stream, the coefficients are sent in order of increasing
  68. total (horizontal + vertical) frequency. The exact picture is as follows:
  69. 00 01 05 06 16 17 33 34
  70. 02 04 07 15 20 32 35 52
  71. 03 10 14 21 31 36 51 53
  72. 11 13 22 30 37 50 54 65
  73. 12 23 27 40 47 55 64 66
  74. 24 26 41 46 56 63 67 74
  75. 25 42 45 57 62 70 73 75
  76. 43 44 60 61 71 72 76 77
  77. Here the position in the matrix corresponds to the (horiz,vert)
  78. frequency indices and the octal entry in the matrix is the position
  79. of the coefficient in the data stream. Thus the coefficients are sent
  80. in sort of a diagonal "snake".
  81. The dequantization stage "uncurls the snake" and stores the expanded
  82. coefficients in more convenient positions. These are not exactly the
  83. natural positions given above but take into account our implementation
  84. of the idct, which basically requires two one-dimensional idcts and
  85. two transposes.
  86. Transposing the 8x8 matrix above gives
  87. 00 02 03 11 12 24 25 43
  88. 01 04 10 13 23 26 42 44
  89. 05 07 14 22 27 41 45 60
  90. 06 15 21 30 40 46 57 61
  91. 16 20 31 37 47 56 62 71
  92. 17 32 36 50 55 63 70 72
  93. 33 35 51 54 64 67 73 76
  94. 34 52 53 65 66 74 75 77
  95. The idct itself is more interesting. Since the two-dimensional dct
  96. basis functions are products of the one-dimensional dct basis functions,
  97. we can compute an inverse (or forward) dct via two 1-D transforms,
  98. on rows then on columns. To exploit SIMD parallelism, we actually do
  99. both operations on columns, interposing a (partial) transpose between
  100. the two 1-D transforms, the first transpose being done by the expansion
  101. described above.
  102. The 8-sample one-dimensional DCT is a standard orthogonal expansion using
  103. the (unnormalized) basis functions
  104. b[k]( i) = cos( pi * k * (2i + 1) / 16);
  105. here k = 0 ... 7 is the frequency and i = 0 ... 7 is the spatial coordinate.
  106. To normalize, b[0] should be multiplied by 1/sqrt( 8) and the other b[k]
  107. should be multiplied by 1/2.
  108. The 8x8 two-dimensional DCT is just the product of one-dimensional DCTs
  109. in each direction. The (unnormalized) basis functions are
  110. B[k,l]( i, j) = b[k]( i) * b[l]( j);
  111. this time k and l are the horizontal and vertical frequencies,
  112. i and j are the horizontal and vertical spatial coordinates;
  113. all indices vary from 0 ... 7 (as above)
  114. and there are now 4 cases of normalization.
  115. Our 1-D idct expansion uses constants C1 ... C7 given by
  116. (*) Ck = C(-k) = cos( pi * k/16) = S(8-k) = -S(k-8) = sin( pi * (8-k)/16)
  117. and the following 1-D algorithm transforming I0 ... I7 to R0 ... R7 :
  118. A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
  119. C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
  120. A. = C4 * (A - C) B. = C4 * (B - D)
  121. C. = A + C D. = B + D
  122. E = C4 * (I0 + I4) F = C4 * (I0 - I4)
  123. G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
  124. E. = E - G
  125. G. = E + G
  126. A.. = F + A. B.. = B. - H
  127. F. = F - A. H. = B. + H
  128. R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
  129. R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
  130. This algorithm was also used by Paul Wilkins in his C implementation;
  131. it is due to Vetterli and Ligtenberg and may be found in the JPEG
  132. reference book by Pennebaker and Mitchell.
  133. Correctness of the algorithm follows from (*) together with the
  134. addition formulas for sine and cosine:
  135. cos( A + B) = cos( A) * cos( B) - sin( A) * sin( B)
  136. sin( A + B) = sin( A) * cos( B) + cos( A) * sin( B)
  137. Note that this implementation absorbs the difference in normalization
  138. between the 0th and higher frequencies, although the results produced
  139. are actually twice as big as they should be. Since we do this for each
  140. dimension, the 2-D idct results are 4x the desired results. Finally,
  141. taking into account that the dequantization multiplies by 4 as well,
  142. our actual results are 16x too big. We fix this by shifting the final
  143. results right by 4 bits.
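  (Concretely: a factor of 4 from dequantization, times 2 from the row pass, times 2
  from the column pass, gives 16; the final shift right by 4 divides that back out.)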
  144. The high precision version approximates C1 ... C7 to 16 bits.
  145. Since there is no multiply that takes one unsigned and one signed operand,
  146. we have to use the signed multiply; therefore C1 ... C5 appear to be
  147. negative, and multiplies involving them must be adjusted to compensate
  148. for this. C6 and C7 do not require this adjustment since
  149. they are < 1/2 and are correctly treated as positive numbers.
  150. The following macro does eight 8-sample one-dimensional idcts in parallel.
  151. This is actually not such a difficult program to write once you
  152. make a couple of observations (I of course was unable to make these
  153. observations until I'd half-written a couple of other versions).
  154. 1. Everything is easy once you are done with the multiplies.
  155. This is because, given X and Y in registers, one may easily
  156. calculate X+Y and X-Y using just those 2 registers.
  157. 2. You always need at least 2 extra registers to calculate products,
  158. so storing 2 temporaries is inevitable. C. and D. seem to be
  159. the best candidates.
  160. 3. The products should be calculated in decreasing order of complexity
  161. (which translates into register pressure). Since C1 ... C5 require
  162. adjustment (and C6, C7 do not), we begin by calculating C and D.
  163. ********************************************************************************/
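/* For reference, a minimal scalar sketch of the 1-D transform described above.
   It is illustrative only (plain C, double precision, unused by the optimized
   path); the function name and the cosine values are assumptions of this sketch,
   chosen to match WmtIdctConst / 65536. It mirrors the A..H intermediates and
   the R0..R7 recombination; the integer code additionally shifts right by 4 at
   the end to remove the 16x scaling discussed above. */
static void idct8_1d_reference(const short *in, double *out)
{
    /* c[k] = cos(k * pi / 16) */
    static const double c[8] = { 1.0, 0.98078528, 0.92387953, 0.83146961,
                                 0.70710678, 0.55557023, 0.38268343, 0.19509032 };
    double A   = c[1]*in[1] + c[7]*in[7], B   = c[7]*in[1] - c[1]*in[7];
    double Cc  = c[3]*in[3] + c[5]*in[5], D   = c[3]*in[5] - c[5]*in[3];
    double Ad  = c[4]*(A - Cc),           Bd  = c[4]*(B - D);
    double Cd  = A + Cc,                  Dd  = B + D;
    double E   = c[4]*(in[0] + in[4]),    F   = c[4]*(in[0] - in[4]);
    double G   = c[2]*in[2] + c[6]*in[6], H   = c[6]*in[2] - c[2]*in[6];
    double Ed  = E - G,                   Gd  = E + G;
    double Add = F + Ad,                  Bdd = Bd - H;
    double Fd  = F - Ad,                  Hd  = Bd + H;
    out[0] = Gd + Cd;   out[1] = Add + Hd;
    out[2] = Add - Hd;  out[3] = Ed + Dd;
    out[4] = Ed - Dd;   out[5] = Fd + Bdd;
    out[6] = Fd - Bdd;  out[7] = Gd - Cd;
}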
  164. /**************************************************************************************
  165. *
  166. * Macro: Wmt_Column_IDCT
  167. *
  168. * Description: The Macro does 1-D IDct on 8 columns.
  169. *
  170. * Input: None
  171. *
  172. * Output: None
  173. *
  174. * Return: None
  175. *
  176. * Special Note: None
  177. *
  178. * Error: None
  179. *
  180. ***************************************************************************************
  181. */
  182. /*
  183. The major difference between Willamette processor and other IA32 processors is that
  184. all of the simd integer instructions now support the 128 bit xmm registers instead
  185. of 64 bit mmx registers. By using these instructions, we can do eight 1-D column idcts
  186. that take shorts as input and output shorts, all at once
  187. */
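/* Illustrative only: the pmulhw idiom used throughout the macros below, written
   with SSE2 intrinsics for clarity (the macros themselves use inline assembly).
   The helper name is an assumption of this sketch. For a constant stored as
   round(cos * 65536) that is >= 32768 (C1 ... C5), pmulhw treats it as negative
   and yields (c * x >> 16) - x, so x is added back afterwards; C6 and C7 are
   < 32768 and need no adjustment. */
#include <emmintrin.h>
static __m128i mulhi_cos_ge_half(__m128i x, __m128i c)
{
    __m128i hi = _mm_mulhi_epi16(x, c);   /* = (c * x >> 16) - x for c >= 32768 */
    return _mm_add_epi16(hi, x);          /* add x back: approximately cos * x */
}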
  188. #define Wmt_Column_IDCT __asm { \
  189. \
  190. __asm movdqa xmm2, I(3) /* xmm2 = i3 */ \
  191. __asm movdqa xmm6, C(3) /* xmm6 = c3 */ \
  192. \
  193. __asm movdqa xmm4, xmm2 /* xmm4 = i3 */ \
  194. __asm movdqa xmm7, I(5) /* xmm7 = i5 */ \
  195. \
  196. __asm pmulhw xmm4, xmm6 /* xmm4 = c3 * i3 - i3 */ \
  197. __asm movdqa xmm1, C(5) /* xmm1 = c5 */ \
  198. \
  199. __asm pmulhw xmm6, xmm7 /* xmm6 = c3 * i5 - i5 */ \
  200. __asm movdqa xmm5, xmm1 /* xmm5 = c5 */ \
  201. \
  202. __asm pmulhw xmm1, xmm2 /* xmm1 = c5 * i3 - i3 */ \
  203. __asm movdqa xmm3, I(1) /* xmm3 = i1 */ \
  204. \
  205. __asm pmulhw xmm5, xmm7 /* xmm5 = c5 * i5 - i5 */ \
  206. __asm movdqa xmm0, C(1) /* xmm0 = c1 */ \
  207. \
  208. /* all registers are in use */ \
  209. \
  210. __asm paddw xmm4, xmm2 /* xmm4 = c3 * i3 */ \
  211. __asm paddw xmm6, xmm7 /* xmm6 = c3 * i5 */ \
  212. \
  213. __asm paddw xmm2, xmm1 /* xmm2 = c5 * i3 */ \
  214. __asm movdqa xmm1, I(7) /* xmm1 = i7 */ \
  215. \
  216. __asm paddw xmm7, xmm5 /* xmm7 = c5 * i5 */ \
  217. __asm movdqa xmm5, xmm0 /* xmm5 = c1 */ \
  218. \
  219. __asm pmulhw xmm0, xmm3 /* xmm0 = c1 * i1 - i1 */ \
  220. __asm paddsw xmm4, xmm7 /* xmm4 = c3 * i3 + c5 * i5 = C */ \
  221. \
  222. __asm pmulhw xmm5, xmm1 /* xmm5 = c1 * i7 - i7 */ \
  223. __asm movdqa xmm7, C(7) /* xmm7 = c7 */ \
  224. \
  225. __asm psubsw xmm6, xmm2 /* xmm6 = c3 * i5 - c5 * i3 = D */ \
  226. __asm paddw xmm0, xmm3 /* xmm0 = c1 * i1 */ \
  227. \
  228. __asm pmulhw xmm3, xmm7 /* xmm3 = c7 * i1 */ \
  229. __asm movdqa xmm2, I(2) /* xmm2 = i2 */ \
  230. \
  231. __asm pmulhw xmm7, xmm1 /* xmm7 = c7 * i7 */ \
  232. __asm paddw xmm5, xmm1 /* xmm5 = c1 * i7 */ \
  233. \
  234. __asm movdqa xmm1, xmm2 /* xmm1 = i2 */ \
  235. __asm pmulhw xmm2, C(2) /* xmm2 = i2 * c2 -i2 */ \
  236. \
  237. __asm psubsw xmm3, xmm5 /* xmm3 = c7 * i1 - c1 * i7 = B */ \
  238. __asm movdqa xmm5, I(6) /* xmm5 = i6 */ \
  239. \
  240. __asm paddsw xmm0, xmm7 /* xmm0 = c1 * i1 + c7 * i7 = A */ \
  241. __asm movdqa xmm7, xmm5 /* xmm7 = i6 */ \
  242. \
  243. __asm psubsw xmm0, xmm4 /* xmm0 = A - C */ \
  244. __asm pmulhw xmm5, C(2) /* xmm5 = c2 * i6 - i6 */ \
  245. \
  246. __asm paddw xmm2, xmm1 /* xmm2 = i2 * c2 */ \
  247. __asm pmulhw xmm1, C(6) /* xmm1 = c6 * i2 */ \
  248. \
  249. __asm paddsw xmm4, xmm4 /* xmm4 = C + C */ \
  250. __asm paddsw xmm4, xmm0 /* xmm4 = A + C = C. */ \
  251. \
  252. __asm psubsw xmm3, xmm6 /* xmm3 = B - D */ \
  253. __asm paddw xmm5, xmm7 /* xmm5 = c2 * i6 */ \
  254. \
  255. __asm paddsw xmm6, xmm6 /* xmm6 = D + D */ \
  256. __asm pmulhw xmm7, C(6) /* xmm7 = c6 * i6 */ \
  257. \
  258. __asm paddsw xmm6, xmm3 /* xmm6 = B + D = D. */ \
  259. __asm movdqa I(1), xmm4 /* Save C. at I(1) */ \
  260. \
  261. __asm psubsw xmm1, xmm5 /* xmm1 = c6 * i2 - c2 * i6 = H */ \
  262. __asm movdqa xmm4, C(4) /* xmm4 = c4 */ \
  263. \
  264. __asm movdqa xmm5, xmm3 /* xmm5 = B - D */ \
  265. __asm pmulhw xmm3, xmm4 /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
  266. \
  267. __asm paddsw xmm7, xmm2 /* xmm7 = c2 * i2 + c6 * i6 = G */ \
  268. __asm movdqa I(2), xmm6 /* Save D. at I(2) */ \
  269. \
  270. __asm movdqa xmm2, xmm0 /* xmm2 = A - C */ \
  271. __asm movdqa xmm6, I(0) /* xmm6 = i0 */ \
  272. \
  273. __asm pmulhw xmm0, xmm4 /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */\
  274. __asm paddw xmm5, xmm3 /* xmm5 = c4 * ( B - D ) = B. */ \
  275. \
  276. __asm movdqa xmm3, I(4) /* xmm3 = i4 */ \
  277. __asm psubsw xmm5, xmm1 /* xmm5 = B. - H = B.. */ \
  278. \
  279. __asm paddw xmm2, xmm0 /* xmm2 = c4 * ( A - C) = A. */ \
  280. __asm psubsw xmm6, xmm3 /* xmm6 = i0 - i4 */ \
  281. \
  282. __asm movdqa xmm0, xmm6 /* xmm0 = i0 - i4 */ \
  283. __asm pmulhw xmm6, xmm4 /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \
  284. \
  285. __asm paddsw xmm3, xmm3 /* xmm3 = i4 + i4 */ \
  286. __asm paddsw xmm1, xmm1 /* xmm1 = H + H */ \
  287. \
  288. __asm paddsw xmm3, xmm0 /* xmm3 = i0 + i4 */ \
  289. __asm paddsw xmm1, xmm5 /* xmm1 = B. + H = H. */ \
  290. \
  291. __asm pmulhw xmm4, xmm3 /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
  292. __asm paddw xmm6, xmm0 /* xmm6 = c4 * ( i0 - i4 ) */ \
  293. \
  294. __asm psubsw xmm6, xmm2 /* xmm6 = F - A. = F. */ \
  295. __asm paddsw xmm2, xmm2 /* xmm2 = A. + A. */ \
  296. \
  297. __asm movdqa xmm0, I(1) /* Load C. from I(1) */ \
  298. __asm paddsw xmm2, xmm6 /* xmm2 = F + A. = A.. */ \
  299. \
  300. __asm paddw xmm4, xmm3 /* xmm4 = c4 * ( i0 + i4 ) = E */ \
  301. __asm psubsw xmm2, xmm1 /* xmm2 = A.. - H. = R2 */ \
  302. \
  303. __asm paddsw xmm2, Eight /* Adjust R2 and R1 before shifting */ \
  304. __asm paddsw xmm1, xmm1 /* xmm1 = H. + H. */ \
  305. \
  306. __asm paddsw xmm1, xmm2 /* xmm1 = A.. + H. = R1 */ \
  307. __asm psraw xmm2, 4 /* xmm2 = op2 */ \
  308. \
  309. __asm psubsw xmm4, xmm7 /* xmm4 = E - G = E. */ \
  310. __asm psraw xmm1, 4 /* xmm1 = op1 */ \
  311. \
  312. __asm movdqa xmm3, I(2) /* Load D. from I(2) */ \
  313. __asm paddsw xmm7, xmm7 /* xmm7 = G + G */ \
  314. \
  315. __asm movdqa O(2), xmm2 /* Write out op2 */ \
  316. __asm paddsw xmm7, xmm4 /* xmm7 = E + G = G. */ \
  317. \
  318. __asm movdqa O(1), xmm1 /* Write out op1 */ \
  319. __asm psubsw xmm4, xmm3 /* xmm4 = E. - D. = R4 */ \
  320. \
  321. __asm paddsw xmm4, Eight /* Adjust R4 and R3 before shifting */ \
  322. __asm paddsw xmm3, xmm3 /* xmm3 = D. + D. */ \
  323. \
  324. __asm paddsw xmm3, xmm4 /* xmm3 = E. + D. = R3 */ \
  325. __asm psraw xmm4, 4 /* xmm4 = op4 */ \
  326. \
  327. __asm psubsw xmm6, xmm5 /* xmm6 = F. - B..= R6 */ \
  328. __asm psraw xmm3, 4 /* xmm3 = op3 */ \
  329. \
  330. __asm paddsw xmm6, Eight /* Adjust R6 and R5 before shifting */ \
  331. __asm paddsw xmm5, xmm5 /* xmm5 = B.. + B.. */ \
  332. \
  333. __asm paddsw xmm5, xmm6 /* xmm5 = F. + B.. = R5 */ \
  334. __asm psraw xmm6, 4 /* xmm6 = op6 */ \
  335. \
  336. __asm movdqa O(4), xmm4 /* Write out op4 */ \
  337. __asm psraw xmm5, 4 /* xmm5 = op5 */ \
  338. \
  339. __asm movdqa O(3), xmm3 /* Write out op3 */ \
  340. __asm psubsw xmm7, xmm0 /* xmm7 = G. - C. = R7 */ \
  341. \
  342. __asm paddsw xmm7, Eight /* Adjust R7 and R0 before shifting */ \
  343. __asm paddsw xmm0, xmm0 /* xmm0 = C. + C. */ \
  344. \
  345. __asm paddsw xmm0, xmm7 /* xmm0 = G. + C. */ \
  346. __asm psraw xmm7, 4 /* xmm7 = op7 */ \
  347. \
  348. __asm movdqa O(6), xmm6 /* Write out op6 */ \
  349. __asm psraw xmm0, 4 /* xmm0 = op0 */ \
  350. \
  351. __asm movdqa O(5), xmm5 /* Write out op5 */ \
  352. __asm movdqa O(7), xmm7 /* Write out op7 */ \
  353. \
  354. __asm movdqa O(0), xmm0 /* Write out op0 */ \
  355. \
  356. } /* End of Wmt_Column_IDCT macro */
  357. /**************************************************************************************
  358. *
  359. * Macro: Wmt_Row_IDCT
  360. *
  361. * Description: The Macro does 1-D IDct on 8 rows.
  362. *
  363. * Input: None
  364. *
  365. * Output: None
  366. *
  367. * Return: None
  368. *
  369. * Special Note: None
  370. *
  371. * Error: None
  372. *
  373. ***************************************************************************************
  374. */
  375. /*
  376. The major difference between Willamette processor and other IA32 processors is that
  377. all of the simd integer instructions now support the 128 bit xmm registers instead
  378. of 64 bit mmx registers. By using these instructions, we can do eight 1-D column idcts
  379. that take shorts as input and output shorts, all at once
  380. */
  381. #define Wmt_Row_IDCT __asm { \
  382. \
  383. __asm movdqa xmm2, I(3) /* xmm2 = i3 */ \
  384. __asm movdqa xmm6, C(3) /* xmm6 = c3 */ \
  385. \
  386. __asm movdqa xmm4, xmm2 /* xmm4 = i3 */ \
  387. __asm movdqa xmm7, I(5) /* xmm7 = i5 */ \
  388. \
  389. __asm pmulhw xmm4, xmm6 /* xmm4 = c3 * i3 - i3 */ \
  390. __asm movdqa xmm1, C(5) /* xmm1 = c5 */ \
  391. \
  392. __asm pmulhw xmm6, xmm7 /* xmm6 = c3 * i5 - i5 */ \
  393. __asm movdqa xmm5, xmm1 /* xmm5 = c5 */ \
  394. \
  395. __asm pmulhw xmm1, xmm2 /* xmm1 = c5 * i3 - i3 */ \
  396. __asm movdqa xmm3, I(1) /* xmm3 = i1 */ \
  397. \
  398. __asm pmulhw xmm5, xmm7 /* xmm5 = c5 * i5 - i5 */ \
  399. __asm movdqa xmm0, C(1) /* xmm0 = c1 */ \
  400. \
  401. /* all registers are in use */ \
  402. \
  403. __asm paddw xmm4, xmm2 /* xmm4 = c3 * i3 */ \
  404. __asm paddw xmm6, xmm7 /* xmm6 = c3 * i5 */ \
  405. \
  406. __asm paddw xmm2, xmm1 /* xmm2 = c5 * i3 */ \
  407. __asm movdqa xmm1, I(7) /* xmm1 = i7 */ \
  408. \
  409. __asm paddw xmm7, xmm5 /* xmm7 = c5 * i5 */ \
  410. __asm movdqa xmm5, xmm0 /* xmm5 = c1 */ \
  411. \
  412. __asm pmulhw xmm0, xmm3 /* xmm0 = c1 * i1 - i1 */ \
  413. __asm paddsw xmm4, xmm7 /* xmm4 = c3 * i3 + c5 * i5 = C */ \
  414. \
  415. __asm pmulhw xmm5, xmm1 /* xmm5 = c1 * i7 - i7 */ \
  416. __asm movdqa xmm7, C(7) /* xmm7 = c7 */ \
  417. \
  418. __asm psubsw xmm6, xmm2 /* xmm6 = c3 * i5 - c5 * i3 = D */ \
  419. __asm paddw xmm0, xmm3 /* xmm0 = c1 * i1 */ \
  420. \
  421. __asm pmulhw xmm3, xmm7 /* xmm3 = c7 * i1 */ \
  422. __asm movdqa xmm2, I(2) /* xmm2 = i2 */ \
  423. \
  424. __asm pmulhw xmm7, xmm1 /* xmm7 = c7 * i7 */ \
  425. __asm paddw xmm5, xmm1 /* xmm5 = c1 * i7 */ \
  426. \
  427. __asm movdqa xmm1, xmm2 /* xmm1 = i2 */ \
  428. __asm pmulhw xmm2, C(2) /* xmm2 = i2 * c2 -i2 */ \
  429. \
  430. __asm psubsw xmm3, xmm5 /* xmm3 = c7 * i1 - c1 * i7 = B */ \
  431. __asm movdqa xmm5, I(6) /* xmm5 = i6 */ \
  432. \
  433. __asm paddsw xmm0, xmm7 /* xmm0 = c1 * i1 + c7 * i7 = A */ \
  434. __asm movdqa xmm7, xmm5 /* xmm7 = i6 */ \
  435. \
  436. __asm psubsw xmm0, xmm4 /* xmm0 = A - C */ \
  437. __asm pmulhw xmm5, C(2) /* xmm5 = c2 * i6 - i6 */ \
  438. \
  439. __asm paddw xmm2, xmm1 /* xmm2 = i2 * c2 */ \
  440. __asm pmulhw xmm1, C(6) /* xmm1 = c6 * i2 */ \
  441. \
  442. __asm paddsw xmm4, xmm4 /* xmm4 = C + C */ \
  443. __asm paddsw xmm4, xmm0 /* xmm4 = A + C = C. */ \
  444. \
  445. __asm psubsw xmm3, xmm6 /* xmm3 = B - D */ \
  446. __asm paddw xmm5, xmm7 /* xmm5 = c2 * i6 */ \
  447. \
  448. __asm paddsw xmm6, xmm6 /* xmm6 = D + D */ \
  449. __asm pmulhw xmm7, C(6) /* xmm7 = c6 * i6 */ \
  450. \
  451. __asm paddsw xmm6, xmm3 /* xmm6 = B + D = D. */ \
  452. __asm movdqa I(1), xmm4 /* Save C. at I(1) */ \
  453. \
  454. __asm psubsw xmm1, xmm5 /* xmm1 = c6 * i2 - c2 * i6 = H */ \
  455. __asm movdqa xmm4, C(4) /* xmm4 = c4 */ \
  456. \
  457. __asm movdqa xmm5, xmm3 /* xmm5 = B - D */ \
  458. __asm pmulhw xmm3, xmm4 /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
  459. \
  460. __asm paddsw xmm7, xmm2 /* xmm7 = c2 * i2 + c6 * i6 = G */ \
  461. __asm movdqa I(2), xmm6 /* Save D. at I(2) */ \
  462. \
  463. __asm movdqa xmm2, xmm0 /* xmm2 = A - C */ \
  464. __asm movdqa xmm6, I(0) /* xmm6 = i0 */ \
  465. \
  466. __asm pmulhw xmm0, xmm4 /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
  467. __asm paddw xmm5, xmm3 /* xmm5 = c4 * ( B - D ) = B. */ \
  468. \
  469. __asm movdqa xmm3, I(4) /* xmm3 = i4 */ \
  470. __asm psubsw xmm5, xmm1 /* xmm5 = B. - H = B.. */ \
  471. \
  472. __asm paddw xmm2, xmm0 /* xmm2 = c4 * ( A - C) = A. */ \
  473. __asm psubsw xmm6, xmm3 /* xmm6 = i0 - i4 */ \
  474. \
  475. __asm movdqa xmm0, xmm6 /* xmm0 = i0 - i4 */ \
  476. __asm pmulhw xmm6, xmm4 /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) = F */ \
  477. \
  478. __asm paddsw xmm3, xmm3 /* xmm3 = i4 + i4 */ \
  479. __asm paddsw xmm1, xmm1 /* xmm1 = H + H */ \
  480. \
  481. __asm paddsw xmm3, xmm0 /* xmm3 = i0 + i4 */ \
  482. __asm paddsw xmm1, xmm5 /* xmm1 = B. + H = H. */ \
  483. \
  484. __asm pmulhw xmm4, xmm3 /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
  485. __asm paddw xmm6, xmm0 /* xmm6 = c4 * ( i0 - i4 ) */ \
  486. \
  487. __asm psubsw xmm6, xmm2 /* xmm6 = F - A. = F. */ \
  488. __asm paddsw xmm2, xmm2 /* xmm2 = A. + A. */ \
  489. \
  490. __asm movdqa xmm0, I(1) /* Load C. from I(1) */ \
  491. __asm paddsw xmm2, xmm6 /* xmm2 = F + A. = A.. */ \
  492. \
  493. __asm paddw xmm4, xmm3 /* xmm4 = c4 * ( i0 + i4 ) = E */ \
  494. __asm psubsw xmm2, xmm1 /* xmm2 = A.. - H. = R2 */ \
  495. \
  496. __asm paddsw xmm1, xmm1 /* xmm1 = H. + H. */ \
  497. __asm paddsw xmm1, xmm2 /* xmm1 = A.. + H. = R1 */ \
  498. \
  499. __asm psubsw xmm4, xmm7 /* xmm4 = E - G = E. */ \
  500. \
  501. __asm movdqa xmm3, I(2) /* Load D. from I(2) */ \
  502. __asm paddsw xmm7, xmm7 /* xmm7 = G + G */ \
  503. \
  504. __asm movdqa I(2), xmm2 /* Write out op2 */ \
  505. __asm paddsw xmm7, xmm4 /* xmm7 = E + G = G. */ \
  506. \
  507. __asm movdqa I(1), xmm1 /* Write out op1 */ \
  508. __asm psubsw xmm4, xmm3 /* xmm4 = E. - D. = R4 */ \
  509. \
  510. __asm paddsw xmm3, xmm3 /* xmm3 = D. + D. */ \
  511. \
  512. __asm paddsw xmm3, xmm4 /* xmm3 = E. + D. = R3 */ \
  513. \
  514. __asm psubsw xmm6, xmm5 /* xmm6 = F. - B..= R6 */ \
  515. \
  516. __asm paddsw xmm5, xmm5 /* xmm5 = B.. + B.. */ \
  517. \
  518. __asm paddsw xmm5, xmm6 /* xmm5 = F. + B.. = R5 */ \
  519. \
  520. __asm movdqa I(4), xmm4 /* Write out op4 */ \
  521. \
  522. __asm movdqa I(3), xmm3 /* Write out op3 */ \
  523. __asm psubsw xmm7, xmm0 /* xmm7 = G. - C. = R7 */ \
  524. \
  525. __asm paddsw xmm0, xmm0 /* xmm0 = C. + C. */ \
  526. \
  527. __asm paddsw xmm0, xmm7 /* xmm0 = G. + C. */ \
  528. \
  529. __asm movdqa I(6), xmm6 /* Write out op6 */ \
  530. \
  531. __asm movdqa I(5), xmm5 /* Write out op5 */ \
  532. __asm movdqa I(7), xmm7 /* Write out op7 */ \
  533. \
  534. __asm movdqa I(0), xmm0 /* Write out op0 */ \
  535. \
  536. } /* End of Wmt_Row_IDCT macro */
  537. /**************************************************************************************
  538. *
  539. * Macro: Transpose
  540. *
  541. * Description: The Macro does 8x8 transpose
  542. *
  543. * Input: None
  544. *
  545. * Output: None
  546. *
  547. * Return: None
  548. *
  549. * Special Note: None
  550. *
  551. * Error: None
  552. *
  553. ***************************************************************************************
  554. */
  555. #define Transpose __asm { \
  556. \
  557. __asm movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */ \
  558. __asm movdqa xmm0, I(5) /* xmm0=f7f6f5f4f3f2f1f0 */ \
  559. \
  560. __asm movdqa xmm5, xmm4 /* make a copy */ \
  561. __asm punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */ \
  562. \
  563. __asm punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */ \
  564. __asm movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */ \
  565. \
  566. __asm movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */ \
  567. __asm movdqa xmm7, xmm6 /* make a copy */ \
  568. \
  569. __asm punpcklwd xmm6, xmm0 /* xmm6=h3g3h2g2h1g1h0g0 */ \
  570. __asm punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */ \
  571. \
  572. __asm movdqa xmm3, xmm4 /* make a copy */ \
  573. __asm punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */ \
  574. \
  575. __asm punpckhdq xmm3, xmm6 /* xmm3=h3g3f3e3h2g2f2e2 */ \
  576. __asm movdqa I(6), xmm3 /* save h3g3f3e3h2g2f2e2 */ \
  577. /* Free xmm6 */ \
  578. __asm movdqa xmm6, xmm5 /* make a copy */ \
  579. __asm punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */ \
  580. \
  581. __asm punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */ \
  582. __asm movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */ \
  583. /* Free xmm7 */ \
  584. __asm movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */ \
  585. __asm movdqa xmm7, xmm0 /* make a copy */ \
  586. \
  587. __asm punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */ \
  588. __asm punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */ \
  589. /* Free xmm1 */ \
  590. __asm movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */ \
  591. __asm movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */ \
  592. \
  593. __asm movdqa xmm1, xmm2 /* make a copy */ \
  594. __asm punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */ \
  595. \
  596. __asm punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */ \
  597. __asm movdqa xmm3, xmm0 /* make a copy */ \
  598. \
  599. __asm punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */ \
  600. __asm punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */ \
  601. /* Free xmm2 */ \
  602. __asm movdqa xmm2, xmm7 /* make a copy */ \
  603. __asm punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */ \
  604. \
  605. __asm punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */ \
  606. __asm movdqa xmm1, xmm0 /* make a copy */ \
  607. \
  608. __asm punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */ \
  609. __asm punpckhqdq xmm1, xmm4 /* xmm1=h1g1f1e1d1c1b1a1 */ \
  610. \
  611. __asm movdqa I(0), xmm0 /* save I(0) */ \
  612. __asm movdqa I(1), xmm1 /* save I(1) */ \
  613. \
  614. __asm movdqa xmm0, I(6) /* load h3g3f3e3h2g2f2e2 */ \
  615. __asm movdqa xmm1, xmm3 /* make a copy */ \
  616. \
  617. __asm punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */ \
  618. __asm punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */ \
  619. \
  620. __asm movdqa xmm4, xmm2 /* make a copy */ \
  621. __asm punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */ \
  622. \
  623. __asm punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */ \
  624. __asm movdqa I(2), xmm1 /* save I(2) */ \
  625. \
  626. __asm movdqa I(3), xmm3 /* save I(3) */ \
  627. __asm movdqa I(4), xmm4 /* save I(4) */ \
  628. \
  629. __asm movdqa I(5), xmm2 /* save I(5) */ \
  630. __asm movdqa xmm5, xmm7 /* make a copy */ \
  631. \
  632. __asm punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */ \
  633. __asm punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */ \
  634. \
  635. __asm movdqa I(6), xmm5 /* save I(6) */ \
  636. __asm movdqa I(7), xmm7 /* save I(7) */ \
  637. \
  638. }/* End of Transpose Macro */
  639. /**************************************************************************************
  640. *
  641. * Macro: Wmt_Dequant
  642. *
  643. * Description: The Macro does dequantization and reorders the coefficients to avoid
  644. * the first transpose before Wmt_Row_IDCT
  645. *
  646. * Input: [eax], quantized input,
  647. * [ebx], quantization table,
  648. *
  649. * Output: [eax]
  650. *
  651. * Return: None
  652. *
  653. * Special Note: None
  654. *
  655. * Error: None
  656. *
  657. ***************************************************************************************
  658. */
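/* Conceptually, the macro below behaves like the scalar loop sketched here
   (illustrative only): each product is written to the position required by the
   register comments, so the dezigzag and the first transpose are folded into
   dequantization. DequantReorderIndex is a hypothetical 64-entry table standing
   in for that placement; it is not defined in this file. */
static void dequant_reorder_reference(short *coeffs, const short *quant,
                                      const unsigned char *DequantReorderIndex)
{
    short tmp[64];
    int i;
    for (i = 0; i < 64; i++)                  /* dequantize in stream order */
        tmp[DequantReorderIndex[i]] = (short)(coeffs[i] * quant[i]);
    for (i = 0; i < 64; i++)                  /* the macro writes back over [eax] */
        coeffs[i] = tmp[i];
}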
  659. #define Wmt_Dequant __asm { \
  660. __asm lea ecx, WmtDequantConst \
  661. __asm movdqa xmm0, [eax] \
  662. \
  663. __asm pmullw xmm0, [ebx] /* xmm0 = 07 06 05 04 03 02 01 00 */ \
  664. __asm movdqa xmm1, [eax + 16] \
  665. \
  666. __asm pmullw xmm1, [ebx + 16] /* xmm1 = 17 16 15 14 13 12 11 10 */ \
  667. __asm pshuflw xmm3, xmm0, 078h /* xmm3 = 07 06 05 04 01 03 02 00 */ \
  668. \
  669. __asm movdqa xmm2, xmm1 /* xmm2 = 17 16 15 14 13 12 11 10 */ \
  670. __asm movdqa xmm7, [ecx] /* xmm7 = -- -- -- -- -- FF FF -- */ \
  671. \
  672. __asm movdqa xmm4, [eax + 32] \
  673. __asm movdqa xmm5, [eax + 64] \
  674. \
  675. __asm pmullw xmm4, [ebx + 32] /* xmm4 = 27 26 25 24 23 22 21 20 */ \
  676. __asm pmullw xmm5, [ebx + 64] /* xmm5 = 47 46 45 44 43 42 41 40 */ \
  677. \
  678. __asm movdqa xmm6, [ecx + 16] /* xmm6 = -- -- FF FF -- -- -- -- */ \
  679. __asm pand xmm7, xmm2 /* xmm7 = -- -- -- -- -- 12 11 -- */ \
  680. \
  681. __asm pand xmm6, xmm4 /* xmm6 = -- -- 25 24 -- -- -- -- */ \
  682. __asm pxor xmm2, xmm7 /* xmm2 = 17 16 15 14 13 -- -- 10 */ \
  683. \
  684. __asm pxor xmm4, xmm6 /* xmm4 = 27 26 -- -- 23 22 21 20 */ \
  685. __asm pslldq xmm7, 4 /* xmm7 = -- -- -- 12 11 -- -- -- */ \
  686. \
  687. __asm pslldq xmm6, 2 /* xmm6 = -- 25 24 -- -- -- -- -- */ \
  688. __asm por xmm7, xmm6 /* xmm7 = -- 25 24 12 11 -- -- -- */ \
  689. \
  690. __asm movdqa xmm0, [ecx + 32] /* xmm0 = -- -- -- -- -- FF FF FF */ \
  691. __asm movdqa xmm6, [ecx + 48] /* xmm6 = -- -- -- -- FF -- -- -- */ \
  692. \
  693. __asm pand xmm0, xmm3 /* xmm0 = -- -- -- -- -- 03 02 00 */ \
  694. __asm pand xmm6, xmm5 /* xmm6 = -- -- -- -- 43 -- -- -- */ \
  695. \
  696. __asm pxor xmm3, xmm0 /* xmm3 = 07 06 05 04 01 -- -- -- */ \
  697. __asm pxor xmm5, xmm6 /* xmm5 = 47 46 45 44 -- 42 41 40 */ \
  698. \
  699. __asm por xmm0, xmm7 /* xmm0 = -- 25 24 12 11 03 02 00 */ \
  700. __asm pslldq xmm6, 8 /* xmm6 = 43 -- -- -- -- -- -- -- */ \
  701. \
  702. __asm por xmm0, xmm6 /* O0 =xmm0 = 43 25 24 12 11 03 02 00 */ \
  703. /* 02345 in use */ \
  704. \
  705. __asm movdqa xmm1, [ecx + 64 ] /* xmm1 = -- -- -- FF FF -- -- -- */ \
  706. __asm pshuflw xmm5, xmm5, 0B4h /* xmm5 = 47 46 45 44 42 -- 41 40 */ \
  707. \
  708. __asm movdqa xmm7, xmm1 /* xmm7 = -- -- -- FF FF -- -- -- */ \
  709. __asm movdqa xmm6, xmm1 /* xmm6 = -- -- -- FF FF -- -- -- */ \
  710. \
  711. __asm movdqa [eax], xmm0 /* write 43 25 24 12 11 03 02 00 */ \
  712. __asm pshufhw xmm4, xmm4, 0C2h /* xmm4 = 27 -- -- 26 23 22 21 20 */ \
  713. \
  714. __asm pand xmm7, xmm4 /* xmm7 = -- -- -- 26 23 -- -- -- */ \
  715. __asm pand xmm1, xmm5 /* xmm1 = -- -- -- 44 42 -- -- -- */ \
  716. \
  717. __asm pxor xmm4, xmm7 /* xmm4 = 27 -- -- -- -- 22 21 20 */ \
  718. __asm pxor xmm5, xmm1 /* xmm5 = 47 46 45 -- -- -- 41 40 */ \
  719. \
  720. __asm pshuflw xmm2, xmm2, 0C6h /* xmm2 = 17 16 15 14 13 10 -- -- */ \
  721. __asm movdqa xmm0, xmm6 /* xmm0 = -- -- -- FF FF -- -- -- */ \
  722. \
  723. __asm pslldq xmm7, 2 /* xmm7 = -- -- 26 23 -- -- -- -- */ \
  724. __asm pslldq xmm1, 6 /* xmm1 = 44 42 -- -- -- -- -- -- */ \
  725. \
  726. __asm psrldq xmm0, 2 /* xmm0 = -- -- -- -- FF FF -- -- */ \
  727. __asm pand xmm6, xmm3 /* xmm6 = -- -- -- 04 01 -- -- -- */ \
  728. \
  729. __asm pand xmm0, xmm2 /* xmm0 = -- -- -- -- 13 10 -- -- */ \
  730. __asm pxor xmm3, xmm6 /* xmm3 = 07 06 05 -- -- -- -- -- */ \
  731. \
  732. __asm pxor xmm2, xmm0 /* xmm2 = 17 16 15 14 -- -- -- -- */ \
  733. __asm psrldq xmm6, 6 /* xmm6 = -- -- -- -- -- -- 04 01 */ \
  734. \
  735. __asm por xmm1, xmm7 /* xmm1 = 44 42 26 23 -- -- -- -- */ \
  736. __asm por xmm0, xmm6 /* xmm0 = -- -- -- -- 13 10 04 01 */ \
  737. /* 12345 in use */ \
  738. __asm por xmm1, xmm0 /* o1 =xmm1 = 44 42 26 23 13 10 04 01 */ \
  739. __asm pshuflw xmm4, xmm4, 093h /* xmm4 = 27 -- -- -- 22 21 20 -- */ \
  740. \
  741. __asm pshufhw xmm4, xmm4, 093h /* xmm4 = -- -- -- 27 22 21 20 -- */ \
  742. __asm movdqa [eax + 16], xmm1 /* write 44 42 26 23 13 10 04 01 */ \
  743. \
  744. __asm pshufhw xmm3, xmm3, 0D2h /* xmm3 = 07 05 -- 06 -- -- -- -- */ \
  745. __asm movdqa xmm0, [ecx + 64] /* xmm0 = -- -- -- FF FF -- -- -- */ \
  746. \
  747. __asm pand xmm0, xmm3 /* xmm0 = -- -- -- 06 -- -- -- -- */ \
  748. __asm psrldq xmm3, 12 /* xmm3 = -- -- -- -- -- -- 07 05 */ \
  749. \
  750. __asm psrldq xmm0, 8 /* xmm0 = -- -- -- -- -- -- -- 06 */ \
  751. \
  752. __asm movdqa xmm6, [ecx + 64] /* xmm6 = -- -- -- FF FF -- -- -- */ \
  753. __asm movdqa xmm7, [ecx + 96] /* xmm7 = -- -- -- -- FF FF -- -- */ \
  754. \
  755. __asm pand xmm6, xmm4 /* xmm6 = -- -- -- 27 22 -- -- -- */ \
  756. __asm pxor xmm4, xmm6 /* xmm4 = -- -- -- -- -- 21 20 -- */ \
  757. \
  758. __asm por xmm3, xmm6 /* xmm3 = -- -- -- 27 22 -- 07 05 */ \
  759. __asm pand xmm7, xmm4 /* xmm7 = -- -- -- -- -- 21 -- -- */ \
  760. \
  761. __asm por xmm0, xmm7 /* xmm0 = -- -- -- -- -- 21 -- 06 */ \
  762. __asm pxor xmm4, xmm7 /* xmm4 = -- -- -- -- -- -- 20 -- */ \
  763. \
  764. __asm movdqa xmm6, [ecx + 16 ] /* xmm6 = -- -- FF FF -- -- -- -- */ \
  765. __asm movdqa xmm1, [ecx + 64 ] /* xmm1 = -- -- -- FF FF -- -- -- */ \
  766. \
  767. __asm pand xmm6, xmm2 /* xmm6 = -- -- 15 14 -- -- -- -- */ \
  768. __asm pand xmm1, xmm6 /* xmm1 = -- -- -- 14 -- -- -- -- */ \
  769. \
  770. __asm pxor xmm2, xmm6 /* xmm2 = 17 16 -- -- -- -- -- -- */ \
  771. __asm pxor xmm6, xmm1 /* xmm6 = -- -- 15 -- -- -- -- -- */ \
  772. \
  773. __asm psrldq xmm1, 4 /* xmm1 = -- -- -- -- -- 14 -- -- */ \
  774. \
  775. __asm psrldq xmm6, 8 /* xmm6 = -- -- -- -- -- -- 15 -- */ \
  776. __asm por xmm3, xmm1 /* xmm3 = -- -- -- 27 22 14 07 05 */ \
  777. \
  778. __asm por xmm0, xmm6 /* xmm0 = -- -- -- -- -- 21 15 06 */ \
  779. __asm pshufhw xmm5, xmm5, 0E1h /* xmm5 = 47 46 -- 45 -- -- 41 40 */ \
  780. \
  781. __asm movdqa xmm1, [ecx + 64] /* xmm1 = -- -- -- FF FF -- -- -- */ \
  782. __asm pshuflw xmm5, xmm5, 072h /* xmm5 = 47 46 -- 45 41 -- 40 -- */ \
  783. \
  784. __asm movdqa xmm6, xmm1 /* xmm6 = -- -- -- FF FF -- -- -- */ \
  785. __asm pand xmm1, xmm5 /* xmm1 = -- -- -- 45 41 -- -- -- */ \
  786. \
  787. __asm pxor xmm5, xmm1 /* xmm5 = 47 46 -- -- -- -- 40 -- */ \
  788. __asm pslldq xmm1, 4 /* xmm1 = -- 45 41 -- -- -- -- -- */ \
  789. \
  790. __asm pshufd xmm5, xmm5, 09Ch /* xmm5 = -- -- -- -- 47 46 40 -- */ \
  791. __asm por xmm3, xmm1 /* xmm3 = -- 45 41 27 22 14 07 05 */ \
  792. \
  793. __asm movdqa xmm1, [eax + 96] /* xmm1 = 67 66 65 64 63 62 61 60 */ \
  794. __asm pmullw xmm1, [ebx + 96] \
  795. \
  796. __asm movdqa xmm7, [ecx] /* xmm7 = -- -- -- -- -- FF FF -- */ \
  797. \
  798. __asm psrldq xmm6, 8 /* xmm6 = -- -- -- -- -- -- -- FF */ \
  799. __asm pand xmm7, xmm5 /* xmm7 = -- -- -- -- -- 46 40 -- */ \
  800. \
  801. __asm pand xmm6, xmm1 /* xmm6 = -- -- -- -- -- -- -- 60 */ \
  802. __asm pxor xmm5, xmm7 /* xmm5 = -- -- -- -- 47 -- -- -- */ \
  803. \
  804. __asm pxor xmm1, xmm6 /* xmm1 = 67 66 65 64 63 62 61 -- */ \
  805. __asm pslldq xmm5, 2 /* xmm5 = -- -- -- 47 -- -- -- -- */ \
  806. \
  807. __asm pslldq xmm6, 14 /* xmm6 = 60 -- -- -- -- -- -- -- */ \
  808. __asm por xmm4, xmm5 /* xmm4 = -- -- -- 47 -- -- 20 -- */ \
  809. \
  810. __asm por xmm3, xmm6 /* O2 = xmm3= 60 45 41 27 22 14 07 05 */ \
  811. __asm pslldq xmm7, 6 /* xmm7 = -- -- 46 40 -- -- -- -- */ \
  812. \
  813. __asm movdqa [eax+32], xmm3 /* write 60 45 41 27 22 14 07 05 */ \
  814. __asm por xmm0, xmm7 /* xmm0 = -- -- 46 40 -- 21 15 06 */ \
  815. /* 0, 1, 2, 4 in use */ \
  816. __asm movdqa xmm3, [eax + 48] /* xmm3 = 37 36 35 34 33 32 31 30 */ \
  817. __asm movdqa xmm5, [eax + 80] /* xmm5 = 57 56 55 54 53 52 51 50 */ \
  818. \
  819. __asm pmullw xmm3, [ebx + 48] \
  820. __asm pmullw xmm5, [ebx + 80] \
  821. \
  822. __asm movdqa xmm6, [ecx + 64] /* xmm6 = -- -- -- FF FF -- -- -- */ \
  823. __asm movdqa xmm7, [ecx + 64] /* xmm7 = -- -- -- FF FF -- -- -- */ \
  824. \
  825. __asm psrldq xmm6, 8 /* xmm6 = -- -- -- -- -- -- -- FF */ \
  826. __asm pslldq xmm7, 8 /* xmm7 = FF -- -- -- -- -- -- -- */ \
  827. \
  828. __asm pand xmm6, xmm3 /* xmm6 = -- -- -- -- -- -- -- 30 */ \
  829. __asm pand xmm7, xmm5 /* xmm7 = 57 -- -- -- -- -- -- -- */ \
  830. \
  831. __asm pxor xmm3, xmm6 /* xmm3 = 37 36 35 34 33 32 31 -- */ \
  832. __asm pxor xmm5, xmm7 /* xmm5 = -- 56 55 54 53 52 51 50 */ \
  833. \
  834. __asm pslldq xmm6, 6 /* xmm6 = -- -- -- -- 30 -- -- -- */ \
  835. __asm psrldq xmm7, 2 /* xmm7 = -- 57 -- -- -- -- -- -- */ \
  836. \
  837. __asm por xmm6, xmm7 /* xmm6 = -- 57 -- -- 30 -- -- -- */ \
  838. __asm movdqa xmm7, [ecx] /* xmm7 = -- -- -- -- -- FF FF -- */ \
  839. \
  840. __asm por xmm0, xmm6 /* xmm0 = -- 57 46 40 30 21 15 06 */ \
  841. __asm psrldq xmm7, 2 /* xmm7 = -- -- -- -- -- -- FF FF */ \
  842. \
  843. __asm movdqa xmm6, xmm2 /* xmm6 = 17 16 -- -- -- -- -- -- */ \
  844. __asm pand xmm7, xmm1 /* xmm7 = -- -- -- -- -- -- 61 -- */ \
  845. \
  846. __asm pslldq xmm6, 2 /* xmm6 = 16 -- -- -- -- -- -- -- */ \
  847. __asm psrldq xmm2, 14 /* xmm2 = -- -- -- -- -- -- -- 17 */ \
  848. \
  849. __asm pxor xmm1, xmm7 /* xmm1 = 67 66 65 64 63 62 -- -- */ \
  850. __asm pslldq xmm7, 12 /* xmm7 = 61 -- -- -- -- -- -- -- */ \
  851. \
  852. __asm psrldq xmm6, 14 /* xmm6 = -- -- -- -- -- -- -- 16 */ \
  853. __asm por xmm4, xmm6 /* xmm4 = -- -- -- 47 -- -- 20 16 */ \
  854. \
  855. __asm por xmm0, xmm7 /* xmm0 = 61 57 46 40 30 21 15 06 */ \
  856. __asm movdqa xmm6, [ecx] /* xmm6 = -- -- -- -- -- FF FF -- */ \
  857. \
  858. __asm psrldq xmm6, 2 /* xmm6 = -- -- -- -- -- -- FF FF */ \
  859. __asm movdqa [eax+48], xmm0 /* write 61 57 46 40 30 21 15 06 */ \
  860. /* 1, 2, 3, 4, 5 in use */\
  861. __asm movdqa xmm0, [ecx] /* xmm0 = -- -- -- -- -- FF FF -- */ \
  862. __asm pand xmm6, xmm3 /* xmm6 = -- -- -- -- -- -- 31 -- */ \
  863. \
  864. __asm movdqa xmm7, xmm3 /* xmm7 = 37 36 35 34 33 32 31 -- */ \
  865. __asm pxor xmm3, xmm6 /* xmm3 = 37 36 35 34 33 32 -- -- */ \
  866. \
  867. __asm pslldq xmm3, 2 /* xmm3 = 36 35 34 33 32 -- -- -- */ \
  868. __asm pand xmm0, xmm1 /* xmm0 = -- -- -- -- -- 62 -- -- */ \
  869. \
  870. __asm psrldq xmm7, 14 /* xmm7 = -- -- -- -- -- -- -- 37 */ \
  871. __asm pxor xmm1, xmm0 /* xmm1 = 67 66 65 64 63 -- -- -- */ \
  872. \
  873. __asm por xmm6, xmm7 /* xmm6 = -- -- -- -- -- -- 31 37 */ \
  874. __asm movdqa xmm7, [ecx + 64] /* xmm7 = -- -- -- FF FF -- -- -- */ \
  875. \
  876. __asm pshuflw xmm6, xmm6, 01Eh /* xmm6 = -- -- -- -- 37 31 -- -- */ \
  877. __asm pslldq xmm7, 6 /* xmm7 = FF FF -- -- -- -- -- -- */ \
  878. \
  879. __asm por xmm4, xmm6 /* xmm4 = -- -- -- 47 37 31 20 16 */ \
  880. __asm pand xmm7, xmm5 /* xmm7 = -- 56 -- -- -- -- -- -- */ \
  881. \
  882. __asm pslldq xmm0, 8 /* xmm0 = -- 62 -- -- -- -- -- -- */ \
  883. __asm pxor xmm5, xmm7 /* xmm5 = -- -- 55 54 53 52 51 50 */ \
  884. \
  885. __asm psrldq xmm7, 2 /* xmm7 = -- -- 56 -- -- -- -- -- */ \
  886. \
  887. __asm pshufhw xmm3, xmm3, 087h /* xmm3 = 35 33 34 36 32 -- -- -- */ \
  888. __asm por xmm0, xmm7 /* xmm0 = -- 62 56 -- -- -- -- -- */ \
  889. \
  890. __asm movdqa xmm7, [eax + 112] /* xmm7 = 77 76 75 74 73 72 71 70 */ \
  891. __asm pmullw xmm7, [ebx + 112] \
  892. \
  893. __asm movdqa xmm6, [ecx + 64] /* xmm6 = -- -- -- FF FF -- -- -- */ \
  894. __asm por xmm4, xmm0 /* xmm4 = -- 62 56 47 37 31 20 16 */ \
  895. \
  896. __asm pshuflw xmm7, xmm7, 0E1h /* xmm7 = 77 76 75 74 73 72 70 71 */ \
  897. __asm psrldq xmm6, 8 /* xmm6 = -- -- -- -- -- -- -- FF */ \
  898. \
  899. __asm movdqa xmm0, [ecx + 64] /* xmm0 = -- -- -- FF FF -- -- -- */ \
  900. __asm pand xmm6, xmm7 /* xmm6 = -- -- -- -- -- -- -- 71 */ \
  901. \
  902. __asm pand xmm0, xmm3 /* xmm0 = -- -- -- 36 32 -- -- -- */ \
  903. __asm pxor xmm7, xmm6 /* xmm7 = 77 76 75 74 73 72 70 -- */ \
  904. \
  905. __asm pxor xmm3, xmm0 /* xmm3 = 35 33 34 -- -- -- -- -- */ \
  906. __asm pslldq xmm6, 14 /* xmm6 = 71 -- -- -- -- -- -- -- */ \
  907. \
  908. __asm psrldq xmm0, 4 /* xmm0 = -- -- -- -- -- 36 32 -- */ \
  909. __asm por xmm4, xmm6 /* xmm4 = 71 62 56 47 37 31 20 16 */ \
  910. \
  911. __asm por xmm2, xmm0 /* xmm2 = -- -- -- -- -- 36 32 17 */ \
  912. __asm movdqa [eax + 64], xmm4 /* write 71 62 56 47 37 31 20 16 */ \
  913. /* 1, 2, 3, 5, 7 in use */ \
  914. __asm movdqa xmm6, [ecx + 80] /* xmm6 = -- -- FF -- -- -- -- FF */ \
  915. __asm pshufhw xmm7, xmm7, 0D2h /* xmm7 = 77 75 74 76 73 72 70 -- */ \
  916. \
  917. __asm movdqa xmm4, [ecx] /* xmm4 = -- -- -- -- -- FF FF -- */ \
  918. __asm movdqa xmm0, [ecx+48] /* xmm0 = -- -- -- -- FF -- -- -- */ \
  919. \
  920. __asm pand xmm6, xmm5 /* xmm6 = -- -- 55 -- -- -- -- 50 */ \
  921. __asm pand xmm4, xmm7 /* xmm4 = -- -- -- -- -- 72 70 -- */ \
  922. \
  923. __asm pand xmm0, xmm1 /* xmm0 = -- -- -- -- 63 -- -- -- */ \
  924. __asm pxor xmm5, xmm6 /* xmm5 = -- -- -- 54 53 52 51 -- */ \
  925. \
  926. __asm pxor xmm7, xmm4 /* xmm7 = 77 75 74 76 73 -- -- -- */ \
  927. __asm pxor xmm1, xmm0 /* xmm1 = 67 66 65 64 -- -- -- -- */ \
  928. \
  929. __asm pshuflw xmm6, xmm6, 02Bh /* xmm6 = -- -- 55 -- 50 -- -- -- */ \
  930. __asm pslldq xmm4, 10 /* xmm4 = 72 70 -- -- -- -- -- -- */ \
  931. \
  932. __asm pshufhw xmm6, xmm6, 0B1h /* xmm6 = -- -- -- 55 50 -- -- -- */ \
  933. __asm pslldq xmm0, 4 /* xmm0 = -- -- 63 -- -- -- -- -- */ \
  934. \
  935. __asm por xmm6, xmm4 /* xmm6 = 72 70 -- 55 50 -- -- -- */ \
  936. __asm por xmm2, xmm0 /* xmm2 = -- -- 63 -- -- 36 32 17 */ \
  937. \
  938. __asm por xmm2, xmm6 /* xmm2 = 72 70 63 55 50 36 32 17 */ \
  939. __asm pshufhw xmm1, xmm1, 0C9h /* xmm1 = 67 64 66 65 -- -- -- -- */ \
  940. \
  941. __asm movdqa xmm6, xmm3 /* xmm6 = 35 33 34 -- -- -- -- -- */ \
  942. __asm movdqa [eax+80], xmm2 /* write 72 70 63 55 50 36 32 17 */ \
  943. \
  944. __asm psrldq xmm6, 12 /* xmm6 = -- -- -- -- -- -- 35 33 */ \
  945. __asm pslldq xmm3, 4 /* xmm3 = 34 -- -- -- -- -- -- -- */ \
  946. \
  947. __asm pshuflw xmm5, xmm5, 04Eh /* xmm5 = -- -- -- 54 51 -- 53 52 */ \
  948. __asm movdqa xmm4, xmm7 /* xmm4 = 77 75 74 76 73 -- -- -- */ \
  949. \
  950. __asm movdqa xmm2, xmm5 /* xmm2 = -- -- -- 54 51 -- 53 52 */ \
  951. __asm psrldq xmm7, 10 /* xmm7 = -- -- -- -- -- 77 75 74 */ \
  952. \
  953. __asm pslldq xmm4, 6 /* xmm4 = 76 73 -- -- -- -- -- -- */ \
  954. __asm pslldq xmm2, 12 /* xmm2 = 53 52 -- -- -- -- -- -- */ \
  955. \
  956. __asm movdqa xmm0, xmm1 /* xmm0 = 67 64 66 65 -- -- -- -- */ \
  957. __asm psrldq xmm1, 12 /* xmm1 = -- -- -- -- -- -- 67 64 */ \
  958. \
  959. __asm psrldq xmm5, 6 /* xmm5 = -- -- -- -- -- -- 54 51 */ \
  960. __asm psrldq xmm3, 14 /* xmm3 = -- -- -- -- -- -- -- 34 */ \
  961. \
  962. __asm pslldq xmm7, 10 /* xmm7 = 77 75 74 -- -- -- -- -- */ \
  963. __asm por xmm4, xmm6 /* xmm4 = 76 73 -- -- -- -- 35 33 */ \
  964. \
  965. __asm psrldq xmm2, 10 /* xmm2 = -- -- -- -- -- 53 52 -- */ \
  966. __asm pslldq xmm0, 4 /* xmm0 = 66 65 -- -- -- -- -- -- */ \
  967. \
  968. __asm pslldq xmm1, 8 /* xmm1 = -- -- 67 64 -- -- -- -- */ \
  969. __asm por xmm3, xmm7 /* xmm3 = 77 75 74 -- -- -- -- 34 */ \
  970. \
  971. __asm psrldq xmm0, 6 /* xmm0 = -- -- -- 66 65 -- -- -- */ \
  972. __asm pslldq xmm5, 4 /* xmm5 = -- -- -- -- 54 51 -- -- */ \
  973. \
  974. __asm por xmm4, xmm1 /* xmm4 = 76 73 67 64 -- -- 35 33 */ \
  975. __asm por xmm3, xmm2 /* xmm3 = 77 75 74 -- -- 53 52 34 */ \
  976. \
  977. __asm por xmm4, xmm5 /* xmm4 = 76 73 67 64 54 51 35 33 */ \
  978. __asm por xmm3, xmm0 /* xmm3 = 77 75 74 66 65 53 52 34 */ \
  979. \
  980. __asm movdqa [eax+96], xmm4 /* write 76 73 67 64 54 51 35 33 */ \
  981. __asm movdqa [eax+112], xmm3 /* write 77 75 74 66 65 53 52 34 */ \
  982. \
  983. }/* end of Wmt_Dequant Macro */
  984. /**************************************************************************************
  985. *
  986. * Macro: Wmt_Dequant_Dx
  987. *
  988. * Description: The Macro does dequantization
  989. *
  990. * Input: [eax], quantized input,
  991. * [ebx], quantization table,
  992. *
  993. * Output: [edx], dequantized coefficients
  994. *
  995. * Return: None
  996. *
  997. * Special Note: None
  998. *
  999. * Error: None
  1000. *
  1001. ***************************************************************************************
  1002. */
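/* A scalar sketch of what this macro computes (illustrative only): plain
   element-wise dequantization with no coefficient reordering, writing to a
   separate output buffer just as the macro writes to [edx]. */
static void dequant_dx_reference(const short *coeffs, const short *quant, short *out)
{
    int i;
    for (i = 0; i < 64; i++)
        out[i] = (short)(coeffs[i] * quant[i]);
}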
  1003. #define Wmt_Dequant_Dx __asm { \
  1004. __asm movdqa xmm0, [eax] \
  1005. __asm movdqa xmm1, [eax + 16] \
  1006. \
  1007. __asm pmullw xmm0, [ebx] /* xmm0 = 07 06 05 04 03 02 01 00 */ \
  1008. __asm pmullw xmm1, [ebx + 16] /* xmm1 = 17 16 15 14 13 12 11 10 */ \
  1009. \
  1010. __asm movdqa xmm2, [eax + 32] \
  1011. __asm movdqa xmm3, [eax + 48] /* xmm3 = 37 36 35 34 33 32 31 30 */ \
  1012. \
  1013. __asm pmullw xmm2, [ebx + 32] /* xmm2 = 27 26 25 24 23 22 21 20 */ \
  1014. __asm pmullw xmm3, [ebx + 48] \
  1015. \
  1016. __asm movdqa [edx], xmm0 /* write 07 06 05 04 03 02 01 00 */ \
  1017. __asm movdqa [edx + 16], xmm1 /* write 17 16 15 14 13 12 11 10 */ \
  1018. \
  1019. __asm movdqa xmm4, [eax + 64] \
  1020. __asm movdqa xmm5, [eax + 80] /* xmm5 = 57 56 55 54 53 52 51 50 */ \
  1021. \
  1022. __asm pmullw xmm4, [ebx + 64] /* xmm4 = 47 46 45 44 43 42 41 40 */ \
  1023. __asm pmullw xmm5, [ebx + 80] \
  1024. \
  1025. __asm movdqa [edx+32], xmm2 /* write 27 26 25 24 23 22 21 20 */ \
  1026. __asm movdqa [edx+48], xmm3 /* write 37 36 35 34 33 32 31 30 */ \
  1027. \
  1028. __asm movdqa xmm6, [eax + 96] /* xmm6 = 67 66 65 64 63 62 61 60 */ \
  1029. __asm movdqa xmm7, [eax + 112] /* xmm7 = 77 76 75 74 73 72 71 70 */ \
  1030. \
  1031. __asm pmullw xmm6, [ebx + 96] \
  1032. __asm pmullw xmm7, [ebx + 112] \
  1033. \
  1034. __asm movdqa [edx+64], xmm4 /* write 47 46 45 44 43 42 41 40 */ \
  1035. __asm movdqa [edx+80], xmm5 /* write 57 56 55 54 53 52 51 50 */ \
  1036. \
  1037. __asm movdqa [edx+96], xmm6 /* write 67 66 65 64 63 62 61 60 */ \
  1038. __asm movdqa [edx+112], xmm7 /* write 77 76 75 74 73 72 71 70 */ \
  1039. \
  1040. }/* end of Wmt_Dequant_Dx Macro */
  1041. /**************************************************************************************
  1042. *
  1043. * Routine: Wmt_IDct_Dx
  1044. *
  1045. * Description: Perform IDCT on a 8x8 block
  1046. *
  1047. * Input: Pointer to input and output buffer
  1048. *
  1049. * Output: None
  1050. *
  1051. * Return: None
  1052. *
  1053. * Special Note: The input coefficients are in raster order
  1054. *
  1055. * Error: None
  1056. *
  1057. ***************************************************************************************
  1058. */
  1059. void Wmt_IDct_Dx(short *InputData, short *QuantizationTable, short *OutputData)
  1060. {
  1061. __asm
  1062. {
  1063. push ebx
  1064. mov eax, InputData
  1065. mov ebx, QuantizationTable
  1066. mov edx, OutputData
  1067. lea ecx, WmtIdctConst
  1068. Wmt_Dequant_Dx
  1069. #undef I
  1070. #undef O
  1071. #undef C
  1072. #define I(i) [edx + 16 * i ]
  1073. #define O(i) [edx + 16 * i ]
  1074. #define C(i) [ecx + 16 * (i-1) ]
  1075. /* Transpose - absorbed by the Wmt_dequant */
  1076. Wmt_Row_IDCT
  1077. Transpose
  1078. Wmt_Column_IDCT
  1079. pop ebx
  1080. }
  1081. }
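/* A minimal usage sketch for Wmt_IDct_Dx (illustrative only; buffer names are
   assumptions of this sketch). All three buffers hold 64 shorts and must be
   16-byte aligned, because the macros above access them with movdqa. */
void Wmt_IDct_Dx_Example(void)
{
    __declspec(align(16)) short coeffs[64];   /* quantized coefficients, raster order */
    __declspec(align(16)) short quant[64];    /* dequantization table (upscaled by 4) */
    __declspec(align(16)) short output[64];   /* reconstructed block */

    /* ... fill coeffs[] and quant[] ... */
    Wmt_IDct_Dx(coeffs, quant, output);
}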
  1082. /**************************************************************************************
  1083. ************** Wmt_IDCT10_Dx ******************************************************
  1084. **************************************************************************************
  1085. In IDCT10, we are dealing with only ten non-zero coefficients in the 8x8 block.
  1086. Since we work in the fashion RowIDCT -> ColumnIDCT, we only have to
  1087. do 1-D row idcts on the first four rows; the remaining four rows are zero anyway.
  1088. After the row idcts, since every column could have nonzero coefficients, we need to do
  1089. eight 1-D column idcts. However, for each column there are at most four nonzero
  1090. coefficients, coefficient 0 to coefficient 3. The same holds for the coefficients
  1091. feeding the 1-D row idcts. For this reason, the process of a 1-D IDCT is simplified
  1092. from the full version:
  1093. A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
  1094. C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
  1095. A. = C4 * (A - C) B. = C4 * (B - D)
  1096. C. = A + C D. = B + D
  1097. E = C4 * (I0 + I4) F = C4 * (I0 - I4)
  1098. G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
  1099. E. = E - G
  1100. G. = E + G
  1101. A.. = F + A. B.. = B. - H
  1102. F. = F - A. H. = B. + H
  1103. R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
  1104. R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
  1105. To:
  1106. A = (C1 * I1) B = (C7 * I1)
  1107. C = (C3 * I3) D = - (C5 * I3)
  1108. A. = C4 * (A - C) B. = C4 * (B - D)
  1109. C. = A + C D. = B + D
  1110. E = C4 * I0 F = E
  1111. G = (C2 * I2) H = (C6 * I2)
  1112. E. = E - G
  1113. G. = E + G
  1114. A.. = F + A. B.. = B. - H
  1115. F. = F - A. H. = B. + H
  1116. R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
  1117. R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
  1118. ******************************************************************************************/
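/* For reference, a scalar sketch of the simplified 1-D transform above, using
   only I0 ... I3 (illustrative only; double precision, names mirror the text). */
static void idct8_1d_idct10_reference(const short *in, double *out)
{
    const double c1 = 0.98078528, c2 = 0.92387953, c3 = 0.83146961,
                 c4 = 0.70710678, c5 = 0.55557023, c6 = 0.38268343,
                 c7 = 0.19509032;
    double A   = c1 * in[1],      B   = c7 * in[1];
    double Cc  = c3 * in[3],      D   = -(c5 * in[3]);
    double Ad  = c4 * (A - Cc),   Bd  = c4 * (B - D);
    double Cd  = A + Cc,          Dd  = B + D;
    double E   = c4 * in[0],      F   = E;
    double G   = c2 * in[2],      H   = c6 * in[2];
    double Ed  = E - G,           Gd  = E + G;
    double Add = F + Ad,          Bdd = Bd - H;
    double Fd  = F - Ad,          Hd  = Bd + H;
    out[0] = Gd + Cd;   out[1] = Add + Hd;
    out[2] = Add - Hd;  out[3] = Ed + Dd;
    out[4] = Ed - Dd;   out[5] = Fd + Bdd;
    out[6] = Fd - Bdd;  out[7] = Gd - Cd;
}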
  1119. /**************************************************************************************
  1120. *
  1121. * Macro: Wmt_Column_IDCT10
  1122. *
  1123. * Description: The Macro does 1-D IDct on 8 columns.
  1124. *
  1125. * Input: None
  1126. *
  1127. * Output: None
  1128. *
  1129. * Return: None
  1130. *
  1131. * Special Note: None
  1132. *
  1133. * Error: None
  1134. *
  1135. ***************************************************************************************
  1136. */
  1137. /*
  1138. The major difference between Willamette processor and other IA32 processors is that
  1139. all of the simd integer instructions now support the 128 bit xmm registers instead
  1140. of 64 bit mmx registers. By using these instructions, we can do eight 1-D column idcts
  1141. that take shorts as input and output shorts, all at once
  1142. */
  1143. #define Wmt_Column_IDCT10 __asm { \
  1144. \
  1145. __asm movdqa xmm2, I(3) /* xmm2 = i3 */ \
  1146. __asm movdqa xmm6, C(3) /* xmm6 = c3 */ \
  1147. \
  1148. __asm movdqa xmm4, xmm2 /* xmm4 = i3 */ \
  1149. __asm pmulhw xmm4, xmm6 /* xmm4 = c3 * i3 - i3 */ \
  1150. \
  1151. __asm movdqa xmm1, C(5) /* xmm1 = c5 */ \
  1152. __asm movdqa xmm5, xmm1 /* xmm5 = c5 */ \
  1153. \
  1154. __asm pmulhw xmm1, xmm2 /* xmm1 = c5 * i3 - i3 */ \
  1155. __asm movdqa xmm3, I(1) /* xmm3 = i1 */ \
  1156. \
  1157. __asm movdqa xmm0, C(1) /* xmm0 = c1 */ \
  1158. __asm paddw xmm4, xmm2 /* xmm4 = c3 * i3 = C */ \
  1159. \
  1160. __asm movdqa xmm7, C(7) /* xmm7 = c7 */ \
  1161. \
  1162. __asm paddw xmm2, xmm1 /* xmm2 = c5 * i3 */ \
  1163. __asm movdqa xmm5, xmm0 /* xmm5 = c1 */ \
  1164. \
  1165. __asm pmulhw xmm0, xmm3 /* xmm0 = c1 * i1 - i1 */ \
  1166. __asm pxor xmm6, xmm6 /* clear xmm6 */ \
  1167. \
  1168. __asm psubsw xmm6, xmm2 /* xmm6 = - c5 * i3 = D */ \
  1169. __asm paddw xmm0, xmm3 /* xmm0 = c1 * i1 = A */ \
  1170. \
  1171. __asm pmulhw xmm3, xmm7 /* xmm3 = c7 * i1 = B */ \
  1172. __asm movdqa xmm2, I(2) /* xmm2 = i2 */ \
  1173. \
  1174. __asm movdqa xmm1, xmm2 /* xmm1 = i2 */ \
  1175. __asm pmulhw xmm2, C(2) /* xmm2 = i2 * c2 -i2 */ \
  1176. \
  1177. __asm psubsw xmm0, xmm4 /* xmm0 = A - C */ \
  1178. \
  1179. __asm paddw xmm2, xmm1 /* xmm2 = i2 * c2 */ \
  1180. __asm pmulhw xmm1, C(6) /* xmm1 = c6 * i2 */ \
  1181. \
  1182. __asm paddsw xmm4, xmm4 /* xmm4 = C + C */ \
  1183. __asm paddsw xmm4, xmm0 /* xmm4 = A + C = C. */ \
  1184. \
  1185. __asm psubsw xmm3, xmm6 /* xmm3 = B - D */ \
  1186. __asm paddsw xmm6, xmm6 /* xmm6 = D + D */ \
  1187. \
  1188. __asm paddsw xmm6, xmm3 /* xmm6 = B + D = D. */ \
  1189. __asm movdqa I(1), xmm4 /* Save C. at I(1) */ \
  1190. \
  1191. __asm movdqa xmm4, C(4) /* xmm4 = c4 */ \
  1192. __asm movdqa xmm5, xmm3 /* xmm5 = B - D */ \
  1193. \
  1194. __asm pmulhw xmm3, xmm4 /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
  1195. \
  1196. __asm movdqa xmm7, xmm2 /* xmm7 = c2 * i2 = G */ \
  1197. __asm movdqa I(2), xmm6 /* Save D. at I(2) */ \
  1198. \
  1199. __asm movdqa xmm2, xmm0 /* xmm2 = A - C */ \
  1200. __asm movdqa xmm6, I(0) /* xmm6 = i0 */ \
  1201. \
  1202. __asm pmulhw xmm0, xmm4 /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */\
  1203. __asm paddw xmm5, xmm3 /* xmm5 = c4 * ( B - D ) = B. */ \
  1204. \
  1205. __asm psubsw xmm5, xmm1 /* xmm5 = B. - H = B.. */ \
  1206. __asm paddw xmm2, xmm0 /* xmm2 = c4 * ( A - C) = A. */ \
  1207. \
  1208. __asm movdqa xmm0, xmm6 /* xmm0 = i0 */ \
  1209. __asm pmulhw xmm6, xmm4 /* xmm6 = (c4 - 1) * i0 = E = F */ \
  1210. \
  1211. __asm paddsw xmm1, xmm1 /* xmm1 = H + H */ \
  1212. __asm paddsw xmm1, xmm5 /* xmm1 = B. + H = H. */ \
  1213. \
  1214. __asm paddw xmm6, xmm0 /* xmm6 = c4 * i0 */ \
  1215. __asm movdqa xmm4, xmm6 /* xmm4 = c4 * i0 = E */ \
  1216. \
  1217. __asm psubsw xmm6, xmm2 /* xmm6 = F - A. = F. */ \
  1218. __asm paddsw xmm2, xmm2 /* xmm2 = A. + A. */ \
  1219. \
  1220. __asm movdqa xmm0, I(1) /* Load C. from I(1) */ \
  1221. __asm paddsw xmm2, xmm6 /* xmm2 = F + A. = A.. */ \
  1222. \
  1223. __asm psubsw xmm2, xmm1 /* xmm2 = A.. - H. = R2 */ \
  1224. \
  1225. __asm paddsw xmm2, Eight /* Adjust R2 and R1 before shifting */ \
  1226. __asm paddsw xmm1, xmm1 /* xmm1 = H. + H. */ \
  1227. \
  1228. __asm paddsw xmm1, xmm2 /* xmm1 = A.. + H. = R1 */ \
  1229. __asm psraw xmm2, 4 /* xmm2 = op2 */ \
  1230. \
  1231. __asm psubsw xmm4, xmm7 /* xmm4 = E - G = E. */ \
  1232. __asm psraw xmm1, 4 /* xmm1 = op1 */ \
  1233. \
  1234. __asm movdqa xmm3, I(2) /* Load D. from I(2) */ \
  1235. __asm paddsw xmm7, xmm7 /* xmm7 = G + G */ \
  1236. \
  1237. __asm movdqa O(2), xmm2 /* Write out op2 */ \
  1238. __asm paddsw xmm7, xmm4 /* xmm7 = E + G = G. */ \
  1239. \
  1240. __asm movdqa O(1), xmm1 /* Write out op1 */ \
  1241. __asm psubsw xmm4, xmm3 /* xmm4 = E. - D. = R4 */ \
  1242. \
  1243. __asm paddsw xmm4, Eight /* Adjust R4 and R3 before shifting */ \
  1244. __asm paddsw xmm3, xmm3 /* xmm3 = D. + D. */ \
  1245. \
  1246. __asm paddsw xmm3, xmm4 /* xmm3 = E. + D. = R3 */ \
  1247. __asm psraw xmm4, 4 /* xmm4 = op4 */ \
  1248. \
  1249. __asm psubsw xmm6, xmm5 /* xmm6 = F. - B..= R6 */ \
  1250. __asm psraw xmm3, 4 /* xmm3 = op3 */ \
  1251. \
  1252. __asm paddsw xmm6, Eight /* Adjust R6 and R5 before shifting */ \
  1253. __asm paddsw xmm5, xmm5 /* xmm5 = B.. + B.. */ \
  1254. \
  1255. __asm paddsw xmm5, xmm6 /* xmm5 = F. + B.. = R5 */ \
  1256. __asm psraw xmm6, 4 /* xmm6 = op6 */ \
  1257. \
  1258. __asm movdqa O(4), xmm4 /* Write out op4 */ \
  1259. __asm psraw xmm5, 4 /* xmm5 = op5 */ \
  1260. \
  1261. __asm movdqa O(3), xmm3 /* Write out op3 */ \
  1262. __asm psubsw xmm7, xmm0 /* xmm7 = G. - C. = R7 */ \
  1263. \
  1264. __asm paddsw xmm7, Eight /* Adjust R7 and R0 before shifting */ \
  1265. __asm paddsw xmm0, xmm0 /* xmm0 = C. + C. */ \
  1266. \
  1267. __asm paddsw xmm0, xmm7 /* xmm0 = G. + C. */ \
  1268. __asm psraw xmm7, 4 /* xmm7 = op7 */ \
  1269. \
  1270. __asm movdqa O(6), xmm6 /* Write out op6 */ \
  1271. __asm psraw xmm0, 4 /* xmm0 = op0 */ \
  1272. \
  1273. __asm movdqa O(5), xmm5 /* Write out op5 */ \
  1274. __asm movdqa O(7), xmm7 /* Write out op7 */ \
  1275. \
  1276. __asm movdqa O(0), xmm0 /* Write out op0 */ \
  1277. \
  1278. } /* End of Wmt_Column_IDCT10 macro */
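/*
Added note (illustration only): the column pass rounds before descaling.
"Eight", declared elsewhere in this file, is presumably a vector of eight
16-bit 8s, so per lane the paddsw Eight / psraw 4 pair computes (r + 8) >> 4,
i.e. a rounded 4-bit descale.  The scalar sketch below ignores the saturation
that paddsw provides:
*/
static short idct_descale_sketch(int r)
{
    return (short)((r + 8) >> 4);
}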
  1279. /**************************************************************************************
  1280. *
  1281. * Macro: Wmt_Row_IDCT10
  1282. *
  1283. * Description: The macro does a 1-D IDCT on each of the 8 rows.
  1284. *
  1285. * Input: None
  1286. *
  1287. * Output: None
  1288. *
  1289. * Return: None
  1290. *
  1291. * Special Note: None
  1292. *
  1293. * Error: None
  1294. *
  1295. ***************************************************************************************
  1296. */
  1297. /*
  1298. The major difference between the Willamette processor and other IA32 processors is
  1299. that all of the SIMD integer instructions now support the 128-bit xmm registers
  1300. instead of the 64-bit mmx registers. By using these instructions we can do eight
  1301. 1-D row IDCTs, taking shorts as input and producing shorts as output, all at once.
  1302. */
  1303. #define Wmt_Row_IDCT10 __asm { \
  1304. \
  1305. __asm movdqa xmm2, I(3) /* xmm2 = i3 */ \
  1306. __asm movdqa xmm6, C(3) /* xmm6 = c3 */ \
  1307. \
  1308. __asm movdqa xmm4, xmm2 /* xmm4 = i3 */ \
  1309. __asm pmulhw xmm4, xmm6 /* xmm4 = c3 * i3 - i3 */ \
  1310. \
  1311. __asm movdqa xmm1, C(5) /* xmm1 = c5 */ \
  1312. __asm movdqa xmm5, xmm1 /* xmm5 = c5 */ \
  1313. \
  1314. __asm pmulhw xmm1, xmm2 /* xmm1 = c5 * i3 - i3 */ \
  1315. __asm movdqa xmm3, I(1) /* xmm3 = i1 */ \
  1316. \
  1317. __asm movdqa xmm0, C(1) /* xmm0 = c1 */ \
  1318. __asm paddw xmm4, xmm2 /* xmm4 = c3 * i3 =C */ \
  1319. \
  1320. __asm movdqa xmm7, C(7) /* xmm7 = c7 */ \
  1321. \
  1322. __asm paddw xmm2, xmm1 /* xmm2 = c5 * i3 */ \
  1323. __asm movdqa xmm5, xmm0 /* xmm5 = c1 */ \
  1324. \
  1325. __asm pmulhw xmm0, xmm3 /* xmm0 = c1 * i1 - i1 */ \
  1326. __asm pxor xmm6, xmm6 /* clear xmm6 */ \
  1327. \
  1328. __asm psubsw xmm6, xmm2 /* xmm6 = - c5 * i3 = D */ \
  1329. __asm paddw xmm0, xmm3 /* xmm0 = c1 * i1 = A */ \
  1330. \
  1331. __asm pmulhw xmm3, xmm7 /* xmm3 = c7 * i1 = B */ \
  1332. __asm movdqa xmm2, I(2) /* xmm2 = i2 */ \
  1333. \
  1334. __asm movdqa xmm1, xmm2 /* xmm1 = i2 */ \
  1335. __asm pmulhw xmm2, C(2) /* xmm2 = i2 * c2 -i2 */ \
  1336. \
  1337. __asm psubsw xmm0, xmm4 /* xmm0 = A - C */ \
  1338. \
  1339. __asm paddw xmm2, xmm1 /* xmm2 = i2 * c2 = G */ \
  1340. __asm pmulhw xmm1, C(6) /* xmm1 = c6 * i2 = H */ \
  1341. \
  1342. __asm paddsw xmm4, xmm4 /* xmm4 = C + C */ \
  1343. __asm paddsw xmm4, xmm0 /* xmm4 = A + C = C. */ \
  1344. \
  1345. __asm psubsw xmm3, xmm6 /* xmm3 = B - D */ \
  1346. __asm paddsw xmm6, xmm6 /* xmm6 = D + D */ \
  1347. \
  1348. __asm paddsw xmm6, xmm3 /* xmm6 = B + D = D. */ \
  1349. __asm movdqa I(1), xmm4 /* Save C. at I(1) */ \
  1350. \
  1351. __asm movdqa xmm4, C(4) /* xmm4 = c4 */ \
  1352. \
  1353. __asm movdqa xmm5, xmm3 /* xmm5 = B - D */ \
  1354. __asm pmulhw xmm3, xmm4 /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
  1355. \
  1356. __asm movdqa xmm7, xmm2 /* xmm7 = c2 * i2 = G */ \
  1357. __asm movdqa I(2), xmm6 /* Save D. at I(2) */ \
  1358. \
  1359. __asm movdqa xmm2, xmm0 /* xmm2 = A - C */ \
  1360. __asm movdqa xmm6, I(0) /* xmm6 = i0 */ \
  1361. \
  1362. __asm pmulhw xmm0, xmm4 /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
  1363. __asm paddw xmm5, xmm3 /* xmm5 = c4 * ( B - D ) = B. */ \
  1364. \
  1365. __asm psubsw xmm5, xmm1 /* xmm5 = B. - H = B.. */ \
  1366. __asm paddw xmm2, xmm0 /* xmm2 = c4 * ( A - C) = A. */ \
  1367. \
  1368. __asm movdqa xmm0, xmm6 /* xmm0 = i0 */ \
  1369. __asm pmulhw xmm6, xmm4 /* xmm6 = ( c4 - 1 ) * i0 = E = F */ \
  1370. \
  1371. __asm paddsw xmm1, xmm1 /* xmm1 = H + H */ \
  1372. __asm paddsw xmm1, xmm5 /* xmm1 = B. + H = H. */ \
  1373. \
  1374. __asm paddw xmm6, xmm0 /* xmm6 = c4 * i0 */ \
  1375. __asm movdqa xmm4, xmm6 /* xmm4 = c4 * i0 */ \
  1376. \
  1377. __asm psubsw xmm6, xmm2 /* xmm6 = F - A. = F. */ \
  1378. __asm paddsw xmm2, xmm2 /* xmm2 = A. + A. */ \
  1379. \
  1380. __asm movdqa xmm0, I(1) /* Load C. from I(1) */ \
  1381. __asm paddsw xmm2, xmm6 /* xmm2 = F + A. = A.. */ \
  1382. \
  1383. __asm psubsw xmm2, xmm1 /* xmm2 = A.. - H. = R2 */ \
  1384. \
  1385. __asm paddsw xmm1, xmm1 /* xmm1 = H. + H. */ \
  1386. __asm paddsw xmm1, xmm2 /* xmm1 = A.. + H. = R1 */ \
  1387. \
  1388. __asm psubsw xmm4, xmm7 /* xmm4 = E - G = E. */ \
  1389. \
  1390. __asm movdqa xmm3, I(2) /* Load D. from I(2) */ \
  1391. __asm paddsw xmm7, xmm7 /* xmm7 = G + G */ \
  1392. \
  1393. __asm movdqa I(2), xmm2 /* Write out op2 */ \
  1394. __asm paddsw xmm7, xmm4 /* xmm7 = E + G = G. */ \
  1395. \
  1396. __asm movdqa I(1), xmm1 /* Write out op1 */ \
  1397. __asm psubsw xmm4, xmm3 /* xmm4 = E. - D. = R4 */ \
  1398. \
  1399. __asm paddsw xmm3, xmm3 /* xmm3 = D. + D. */ \
  1400. \
  1401. __asm paddsw xmm3, xmm4 /* xmm3 = E. + D. = R3 */ \
  1402. \
  1403. __asm psubsw xmm6, xmm5 /* xmm6 = F. - B..= R6 */ \
  1404. \
  1405. __asm paddsw xmm5, xmm5 /* xmm5 = B.. + B.. */ \
  1406. \
  1407. __asm paddsw xmm5, xmm6 /* xmm5 = F. + B.. = R5 */ \
  1408. \
  1409. __asm movdqa I(4), xmm4 /* Write out op4 */ \
  1410. \
  1411. __asm movdqa I(3), xmm3 /* Write out op3 */ \
  1412. __asm psubsw xmm7, xmm0 /* xmm7 = G. - C. = R7 */ \
  1413. \
  1414. __asm paddsw xmm0, xmm0 /* xmm0 = C. + C. */ \
  1415. \
  1416. __asm paddsw xmm0, xmm7 /* xmm0 = G. + C. */ \
  1417. \
  1418. __asm movdqa I(6), xmm6 /* Write out op6 */ \
  1419. \
  1420. __asm movdqa I(5), xmm5 /* Write out op5 */ \
  1421. __asm movdqa I(7), xmm7 /* Write out op7 */ \
  1422. \
  1423. __asm movdqa I(0), xmm0 /* Write out op0 */ \
  1424. \
  1425. } /* End of Wmt_Row_IDCT10 macro */
  1426. /**************************************************************************************
  1427. *
  1428. * Macro: Transpose10
  1429. *
  1430. * Description: The macro does an 8x8 transpose of 16-bit data.
  1431. *
  1432. * Input: None
  1433. *
  1434. * Output: None
  1435. *
  1436. * Return: None
  1437. *
  1438. * Special Note: None
  1439. *
  1440. * Error: None
  1441. *
  1442. ***************************************************************************************
  1443. */
  1444. #define Transpose10 __asm { \
  1445. \
  1446. __asm movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */ \
  1447. __asm movdqa xmm0, I(5) /* xmm0=f7f6f5f4f3f2f1f0 */ \
  1448. \
  1449. __asm movdqa xmm5, xmm4 /* make a copy */ \
  1450. __asm punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */ \
  1451. \
  1452. __asm punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */ \
  1453. __asm movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */ \
  1454. \
  1455. __asm movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */ \
  1456. __asm movdqa xmm7, xmm6 /* make a copy */ \
  1457. \
  1458. __asm punpcklwd xmm6, xmm0 /* xmm6=h3g3h2g2h1g1h0g0 */ \
  1459. __asm punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */ \
  1460. \
  1461. __asm movdqa xmm3, xmm4 /* make a copy */ \
  1462. __asm punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */ \
  1463. \
  1464. __asm punpckhdq xmm3, xmm6 /* xmm3=h3g3f3e3h2g2f2e2 */ \
  1465. __asm movdqa I(6), xmm3 /* save h3g3f3e3h2g2f2e2 */ \
  1466. /* Free xmm6 */ \
  1467. __asm movdqa xmm6, xmm5 /* make a copy */ \
  1468. __asm punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */ \
  1469. \
  1470. __asm punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */ \
  1471. __asm movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */ \
  1472. /* Free xmm7 */ \
  1473. __asm movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */ \
  1474. __asm movdqa xmm7, xmm0 /* make a copy */ \
  1475. \
  1476. __asm punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */ \
  1477. __asm punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */ \
  1478. /* Free xmm1 */ \
  1479. __asm movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */ \
  1480. __asm movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */ \
  1481. \
  1482. __asm movdqa xmm1, xmm2 /* make a copy */ \
  1483. __asm punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */ \
  1484. \
  1485. __asm punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */ \
  1486. __asm movdqa xmm3, xmm0 /* make a copy */ \
  1487. \
  1488. __asm punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */ \
  1489. __asm punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */ \
  1490. /* Free xmm2 */ \
  1491. __asm movdqa xmm2, xmm7 /* make a copy */ \
  1492. __asm punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */ \
  1493. \
  1494. __asm punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */ \
  1495. __asm movdqa xmm1, xmm0 /* make a copy */ \
  1496. \
  1497. __asm punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */ \
  1498. __asm punpckhqdq xmm1, xmm4 /* xmm1=h1g1f1e1d1c1b1a1 */ \
  1499. \
  1500. __asm movdqa I(0), xmm0 /* save I(0) */ \
  1501. __asm movdqa I(1), xmm1 /* save I(1) */ \
  1502. \
  1503. __asm movdqa xmm0, I(6) /* load h3g3f3e3h2g2f2e2 */ \
  1504. __asm movdqa xmm1, xmm3 /* make a copy */ \
  1505. \
  1506. __asm punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */ \
  1507. __asm punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */ \
  1508. \
  1509. __asm movdqa xmm4, xmm2 /* make a copy */ \
  1510. __asm punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */ \
  1511. \
  1512. __asm punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */ \
  1513. __asm movdqa I(2), xmm1 /* save I(2) */ \
  1514. \
  1515. __asm movdqa I(3), xmm3 /* save I(3) */ \
  1516. __asm movdqa I(4), xmm4 /* save I(4) */ \
  1517. \
  1518. __asm movdqa I(5), xmm2 /* save I(5) */ \
  1519. __asm movdqa xmm5, xmm7 /* make a copy */ \
  1520. \
  1521. __asm punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */ \
  1522. __asm punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */ \
  1523. \
  1524. __asm movdqa I(6), xmm5 /* save I(6) */ \
  1525. __asm movdqa I(7), xmm7 /* save I(7) */ \
  1526. \
  1527. }/* End of Transpose10 Macro */
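/*
Added illustration (not used by the macros above): the same 8x8 transpose of
16-bit values written with SSE2 intrinsics.  It follows the identical unpack
ladder -- words, then dwords, then qwords -- and the in-register layouts match
the comments above; the function name and in-place interface are made up for
this sketch.
*/
#include <emmintrin.h>

static void transpose8x8_epi16_sketch(short *blk) /* 64 shorts, 16-byte aligned */
{
    __m128i r0 = _mm_load_si128((__m128i *)(blk + 0));  /* a7..a0 */
    __m128i r1 = _mm_load_si128((__m128i *)(blk + 8));  /* b7..b0 */
    __m128i r2 = _mm_load_si128((__m128i *)(blk + 16)); /* c7..c0 */
    __m128i r3 = _mm_load_si128((__m128i *)(blk + 24)); /* d7..d0 */
    __m128i r4 = _mm_load_si128((__m128i *)(blk + 32)); /* e7..e0 */
    __m128i r5 = _mm_load_si128((__m128i *)(blk + 40)); /* f7..f0 */
    __m128i r6 = _mm_load_si128((__m128i *)(blk + 48)); /* g7..g0 */
    __m128i r7 = _mm_load_si128((__m128i *)(blk + 56)); /* h7..h0 */

    /* Interleave 16-bit words of neighbouring rows. */
    __m128i t0 = _mm_unpacklo_epi16(r0, r1); /* b3a3 b2a2 b1a1 b0a0 */
    __m128i t1 = _mm_unpackhi_epi16(r0, r1); /* b7a7 b6a6 b5a5 b4a4 */
    __m128i t2 = _mm_unpacklo_epi16(r2, r3); /* d3c3 d2c2 d1c1 d0c0 */
    __m128i t3 = _mm_unpackhi_epi16(r2, r3); /* d7c7 d6c6 d5c5 d4c4 */
    __m128i t4 = _mm_unpacklo_epi16(r4, r5); /* f3e3 f2e2 f1e1 f0e0 */
    __m128i t5 = _mm_unpackhi_epi16(r4, r5); /* f7e7 f6e6 f5e5 f4e4 */
    __m128i t6 = _mm_unpacklo_epi16(r6, r7); /* h3g3 h2g2 h1g1 h0g0 */
    __m128i t7 = _mm_unpackhi_epi16(r6, r7); /* h7g7 h6g6 h5g5 h4g4 */

    /* Interleave 32-bit pairs. */
    __m128i u0 = _mm_unpacklo_epi32(t0, t2); /* d1c1b1a1 d0c0b0a0 */
    __m128i u1 = _mm_unpackhi_epi32(t0, t2); /* d3c3b3a3 d2c2b2a2 */
    __m128i u2 = _mm_unpacklo_epi32(t1, t3); /* d5c5b5a5 d4c4b4a4 */
    __m128i u3 = _mm_unpackhi_epi32(t1, t3); /* d7c7b7a7 d6c6b6a6 */
    __m128i u4 = _mm_unpacklo_epi32(t4, t6); /* h1g1f1e1 h0g0f0e0 */
    __m128i u5 = _mm_unpackhi_epi32(t4, t6); /* h3g3f3e3 h2g2f2e2 */
    __m128i u6 = _mm_unpacklo_epi32(t5, t7); /* h5g5f5e5 h4g4f4e4 */
    __m128i u7 = _mm_unpackhi_epi32(t5, t7); /* h7g7f7e7 h6g6f6e6 */

    /* Interleave 64-bit halves and store the transposed rows. */
    _mm_store_si128((__m128i *)(blk + 0),  _mm_unpacklo_epi64(u0, u4)); /* h0g0f0e0d0c0b0a0 */
    _mm_store_si128((__m128i *)(blk + 8),  _mm_unpackhi_epi64(u0, u4)); /* h1g1f1e1d1c1b1a1 */
    _mm_store_si128((__m128i *)(blk + 16), _mm_unpacklo_epi64(u1, u5));
    _mm_store_si128((__m128i *)(blk + 24), _mm_unpackhi_epi64(u1, u5));
    _mm_store_si128((__m128i *)(blk + 32), _mm_unpacklo_epi64(u2, u6));
    _mm_store_si128((__m128i *)(blk + 40), _mm_unpackhi_epi64(u2, u6));
    _mm_store_si128((__m128i *)(blk + 48), _mm_unpacklo_epi64(u3, u7));
    _mm_store_si128((__m128i *)(blk + 56), _mm_unpackhi_epi64(u3, u7)); /* h7g7f7e7d7c7b7a7 */
}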
  1528. /**************************************************************************************
  1529. *
  1530. * Macro: Wmt_Dequant10_Dx
  1531. *
  1532. * Description: The macro does dequantization
  1533. *
  1534. * Input: [eax], quantized input,
  1535. * [ebx], quantization table,
  1536. *
  1537. * Output: [edx], dequantized coefficients
  1538. *
  1539. * Return: None
  1540. *
  1541. * Special Note: None
  1542. *
  1543. * Error: None
  1544. *
  1545. ***************************************************************************************
  1546. */
  1547. #define Wmt_Dequant10_Dx __asm { \
  1548. __asm movdqa xmm0, [eax] \
  1549. __asm movdqa xmm1, [eax + 16] \
  1550. \
  1551. __asm pmullw xmm0, [ebx] /* xmm0 = 07 06 05 04 03 02 01 00 */ \
  1552. __asm pmullw xmm1, [ebx + 16] /* xmm1 = 17 16 15 14 13 12 11 10 */ \
  1553. \
  1554. __asm movdqa xmm2, [eax + 32] \
  1555. __asm movdqa xmm3, [eax + 48] /* xmm3 = 37 36 35 34 33 32 31 30 */ \
  1556. \
  1557. __asm pmullw xmm2, [ebx + 32] /* xmm2 = 27 26 25 24 23 22 21 20 */ \
  1558. __asm pmullw xmm3, [ebx + 48] \
  1559. \
  1560. __asm movdqa [edx], xmm0 /* write */ \
  1561. __asm movdqa [edx + 16], xmm1 /* write */ \
  1562. \
  1563. __asm movdqa [edx+32], xmm2 /* write */ \
  1564. __asm movdqa [edx+48], xmm3 /* write */ \
  1565. \
  1566. }/* end of Wmt_Dequant10_Dx Macro */
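/*
Added illustration: the scalar equivalent of Wmt_Dequant10_Dx.  Only the first
four rows (32 coefficients) are dequantized because the ten potentially
nonzero coefficients all lie there; the row IDCT pass that follows writes all
eight rows of the output block.  The function and pointer names are made up;
the cast keeps the low 16 bits of each product, as pmullw does.
*/
static void dequant10_sketch(const short *quantized,   /* what eax points to */
                             const short *quant_table, /* what ebx points to */
                             short *dequantized)       /* what edx points to */
{
    int i;
    for (i = 0; i < 32; i++)
        dequantized[i] = (short)(quantized[i] * quant_table[i]);
}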
  1567. /**************************************************************************************
  1568. *
  1569. * Routine: Wmt_IDct10_Dx
  1570. *
  1571. * Description: Perform the IDCT on an 8x8 block in which only the first 10
  1572. * coefficients can be non-zero.
  1573. *
  1574. * Input: Pointer to input and output buffer
  1575. *
  1576. * Output: None
  1577. *
  1578. * Return: None
  1579. *
  1580. * Special Note: The input coefficients are in raster order
  1581. *
  1582. * Error: None
  1583. *
  1584. ***************************************************************************************
  1585. */
  1586. void Wmt_IDct10_Dx(short *InputData, short *QuantizationTable, short *OutputData)
  1587. {
  1588. __asm
  1589. {
  1590. push ebx
  1591. mov eax, InputData
  1592. mov ebx, QuantizationTable
  1593. mov edx, OutputData
  1594. lea ecx, WmtIdctConst
  1595. Wmt_Dequant10_Dx
  1596. #define I(i) [edx + 16 * i ]
  1597. #define O(i) [edx + 16 * i ]
  1598. #define C(i) [ecx + 16 * (i-1) ]
  1599. /* Transpose - absorbed by the Wmt_dequant */
  1600. Wmt_Row_IDCT10
  1601. Transpose10
  1602. Wmt_Column_IDCT10
  1603. pop ebx
  1604. }
  1605. }
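/*
Added note: a hypothetical call site (all names below are illustrative).  The
three buffers must be 16-byte aligned, since the dequant and IDCT macros use
movdqa loads and stores throughout.
*/
static void example_idct10_call_sketch(void)
{
    __declspec(align(16)) short quantized[64];   /* raster-order coefficients */
    __declspec(align(16)) short quant_table[64]; /* dequantization table      */
    __declspec(align(16)) short block[64];       /* reconstructed 8x8 output  */

    /* ... fill quantized[] and quant_table[] ... */
    Wmt_IDct10_Dx(quantized, quant_table, block);
}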
  1606. /**************************************************************************************
  1607. *
  1608. * Routine: Wmt_idct1
  1609. *
  1610. * Description: Perform the IDCT on an 8x8 block in which only the first (DC)
  coefficient is non-zero.
  1611. *
  1612. * Input: Pointer to input and output buffer
  1613. *
  1614. * Output: None
  1615. *
  1616. * Return: None
  1617. *
  1618. * Special Note: We only have one coefficient
  1619. *
  1620. * Error: None
  1621. *
  1622. ***************************************************************************************
  1623. */
  1624. void Wmt_idct1 (short * input, short * qtbl, short * output)
  1625. {
  1626. __asm
  1627. {
  1628. mov eax, [input]
  1629. mov edx, 0xf
  1630. movd xmm2, edx
  1631. mov ecx, [qtbl]
  1632. mov edx, [output]
  1633. movq xmm0, QWORD ptr [eax]
  1634. movq xmm1, QWORD ptr [ecx]
  1635. pmullw xmm0, xmm1;
  1636. paddw xmm0, xmm2
  1637. psraw xmm0, 5;
  1638. punpcklwd xmm0, xmm0;
  1639. punpckldq xmm0, xmm0;
  1640. punpcklqdq xmm0, xmm0;
  1641. movdqa xmm1, xmm0
  1642. movdqa [edx], xmm0;
  1643. movdqa [edx+16], xmm1;
  1644. movdqa [edx+32], xmm0;
  1645. movdqa [edx+48], xmm1;
  1646. movdqa [edx+64], xmm0;
  1647. movdqa [edx+80], xmm1;
  1648. movdqa [edx+96], xmm0;
  1649. movdqa [edx+112], xmm1;
  1650. }
  1651. }
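/*
Added illustration: the scalar arithmetic Wmt_idct1 performs.  The DC value is
dequantized, rounded with (x + 15) >> 5 (the 0xf loaded into xmm2 lands in the
one lane that survives the broadcast), and then replicated to all 64 outputs;
the assembly does the replication with the unpack chain and eight 16-byte
stores.  The function name is made up for this sketch.
*/
static void idct1_sketch(const short *input, const short *qtbl, short *output)
{
    short dc = (short)((((int)input[0] * qtbl[0]) + 15) >> 5);
    int i;

    for (i = 0; i < 64; i++)
        output[i] = dc;
}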
  1652. /**************************************************************************************
  1653. ************** Wmt_IDCT3 ******************************************************
  1654. **************************************************************************************
  1655. */
  1656. /**************************************************************************************
  1657. *
  1658. * Routine: Wmt_IDCT3
  1659. *
  1660. * Description: Perform the IDCT on an 8x8 block with at most 3 nonzero coefficients
  1661. *
  1662. * Input: Pointer to input and output buffer
  1663. *
  1664. * Output: None
  1665. *
  1666. * Return: None
  1667. *
  1668. * Special Note: Intel Compiler, Please
  1669. *
  1670. * Error: None
  1671. *
  1672. ***************************************************************************************
  1673. */
  1674. /***************************************************************************************
  1675. In IDCT3 we deal with only three non-zero coefficients in the 8x8 block.
  1676. If we work in the order RowIDCT -> ColumnIDCT, we only have to do 1-D row
  1677. IDCTs on the first two rows; the remaining six rows stay zero anyway.
  1678. After the row IDCTs, since every column could have nonzero coefficients, we
  1679. need to do eight 1-D column IDCTs. However, each column has at most two
  1680. nonzero coefficients, coefficient 0 and coefficient 1. The same holds for the
  1681. coefficients of the two 1-D row IDCTs. For this reason, the 1-D IDCT is
  1682. simplified from the full version:
  1683. A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
  1684. C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
  1685. A. = C4 * (A - C) B. = C4 * (B - D)
  1686. C. = A + C D. = B + D
  1687. E = C4 * (I0 + I4) F = C4 * (I0 - I4)
  1688. G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
  1689. E. = E - G
  1690. G. = E + G
  1691. A.. = F + A. B.. = B. - H
  1692. F. = F - A. H. = B. + H
  1693. R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
  1694. R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
  1695. To:
  1696. A = (C1 * I1) B = (C7 * I1)
  1697. C = 0 D = 0
  1698. A. = C4 * A B. = C4 * B
  1699. C. = A D. = B
  1700. E = C4 * I0 F = E
  1701. G = 0 H = 0
  1702. E. = E
  1703. G. = E
  1704. A.. = E + A. B.. = B.
  1705. F. = E - A. H. = B.
  1706. R0 = E + A R1 = E + A. + B. R3 = E + B R5 = E - A. + B.
  1707. R7 = E - A R2 = E + A. - B. R4 = E - B R6 = E - A. - B.
  1708. ******************************************************************************************/
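/*
Added illustration (not part of the original assembly): a scalar C sketch of
the simplified 3-coefficient 1-D transform above, where only I0 and I1 can be
nonzero.  As before, Ck is assumed to be cos(k*pi/16) and the fixed-point
descaling is omitted; the function name is made up.
*/
#include <math.h>

static void idct3_1d_sketch(double i0, double i1, double R[8])
{
    const double PI = 3.14159265358979323846;
    double A  = cos(1*PI/16) * i1,  B  = cos(7*PI/16) * i1; /* A = C1*I1, B = C7*I1 */
    double Ad = cos(4*PI/16) * A,   Bd = cos(4*PI/16) * B;  /* A. = C4*A, B. = C4*B */
    double E  = cos(4*PI/16) * i0;                          /* E = C4*I0            */

    R[0] = E + A;  R[1] = E + Ad + Bd;  R[3] = E + B;  R[5] = E - Ad + Bd;
    R[7] = E - A;  R[2] = E + Ad - Bd;  R[4] = E - B;  R[6] = E - Ad - Bd;
}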