mmxrecon.c 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856
  1. /****************************************************************************
  2. *
  3. * Module Title : OptFunctions.c
  4. *
  5. * Description : MMX or otherwise processor specific
  6. * optimised versions of functions
  7. *
  8. * AUTHOR : Paul Wilkins
  9. *
  10. *****************************************************************************
  11. * Revision History
  12. *
  13. * 1.07 JBB 26/01/01 Removed unused function
  14. * 1.06 YWX 23/05/00 Remove the clamping in MmxReconPostProcess()
  15. * 1.05 YWX 15/05/00 Added MmxReconPostProcess()
  16. * 1.04 SJL 03/14/00 Added in Tim's versions of MmxReconInter and MmxReconInterHalfPixel2.
  17. * 1.03 PGW 12/10/99 Changes to reduce unnecessary dependencies.
  18. * 1.02 PGW 30/08/99 Minor changes to MmxReconInterHalfPixel2().
  19. * 1.01 PGW 13/07/99 Changes to keep reconstruction data to 16 bit
  20. * 1.00 PGW 14/06/99 Configuration baseline
  21. *
  22. *****************************************************************************
  23. */
  24. /*
  25. Use Tim's optimized version.
  26. */
  27. #define USING_TIMS 1
  28. /****************************************************************************
  29. * Header Files
  30. *****************************************************************************
  31. */
  32. #define STRICT // Strict type checking.
  33. #include "codec_common.h"
  34. #include "reconstruct.h"
  35. /****************************************************************************
  36. * Module constants.
  37. *****************************************************************************
  38. */
  39. /****************************************************************************
  40. * Imports.
  41. *****************************************************************************
  42. */
  43. extern INT32 * XX_LUT;
  44. /****************************************************************************
  45. * Exported Global Variables
  46. *****************************************************************************
  47. */
  48. /****************************************************************************
  49. * Exported Functions
  50. *****************************************************************************
  51. */
  52. /****************************************************************************
  53. * Module Statics
  54. *****************************************************************************
  55. */
  56. INT16 Ones[4] = {1,1,1,1};
  57. INT16 OneTwoEight[4] = {128,128,128,128};
  58. UINT8 Eight128s[8] = {128,128,128,128,128,128,128,128};
  59. #pragma warning( disable : 4799 ) // Disable no emms instruction warning!
  60. /****************************************************************************
  61. * Forward References
  62. *****************************************************************************
  63. */
  64. /****************************************************************************
  65. *
  66. * ROUTINE : MMXReconIntra
  67. *
  68. * INPUTS : INT16 * idct
  69. * Pointer to the output from the idct for this block
  70. *
  71. * UINT32 stride
  72. * Line Length in pixels in recon and reference images
  73. *
  74. *
  75. *
  76. *
  77. * OUTPUTS : UINT8 * dest
  78. * The reconstruction buffer
  79. *
  80. * RETURNS : None
  81. *
  82. * FUNCTION : Reconstructs an intra block - MMX version
  83. *
  84. * SPECIAL NOTES : Tim Murphy's optimized version
  85. *
  86. *
  87. * ERRORS : None.
  88. *
  89. ****************************************************************************/
  90. void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
  91. {
  92. (void) TmpDataBuffer;
  93. __asm
  94. {
  95. // u pipe
  96. // v pipe
  97. mov eax,[idct] ; Signed 16 bit inputs
  98. mov edx,[dest] ; Signed 8 bit outputs
  99. movq mm0,[Eight128s] ; Set mm0 to 0x8080808080808080
  100. ;
  101. mov ebx,[stride] ; Line stride in output buffer
  102. lea ecx,[eax+128] ; Endpoint in input buffer
  103. loop_label: ;
  104. movq mm2,[eax] ; First four input values
  105. ;
  106. packsswb mm2,[eax+8] ; pack with next(high) four values
  107. por mm0,mm0 ; stall
  108. pxor mm2,mm0 ; Convert result to unsigned (same as add 128)
  109. lea eax,[eax + 16] ; Step source buffer
  110. cmp eax,ecx ; are we done
  111. ;
  112. movq [edx],mm2 ; store results
  113. ;
  114. lea edx,[edx+ebx] ; Step output buffer
  115. jc loop_label ; Loop back if we are not done
  116. }
  117. // 6c/8 elts = 9c/8 = 1.125 c/pix
  118. }
  119. /****************************************************************************
  120. *
  121. * ROUTINE : MmxReconInter
  122. *
  123. * INPUTS : UINT8 * RefPtr
  124. * The last frame reference
  125. *
  126. * INT16 * ChangePtr
  127. * Pointer to the change data
  128. *
  129. * UINT32 LineStep
  130. * Line Length in pixels in recon and ref images
  131. *
  132. * OUTPUTS : UINT8 * ReconPtr
  133. * The reconstruction
  134. *
  135. * RETURNS : None
  136. *
  137. * FUNCTION : Reconstructs data from last data and change
  138. *
  139. * SPECIAL NOTES :
  140. *
  141. *
  142. * ERRORS : None.
  143. *
  144. ****************************************************************************/
  145. #if USING_TIMS
  146. void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
  147. {
  148. (void) TmpDataBuffer;
  149. _asm {
  150. push edi
  151. ;; mov ebx, [ref]
  152. ;; mov ecx, [diff]
  153. ;; mov eax, [dest]
  154. ;; mov edx, [stride]
  155. mov ebx, [RefPtr]
  156. mov ecx, [ChangePtr]
  157. mov eax, [ReconPtr]
  158. mov edx, [LineStep]
  159. pxor mm0, mm0
  160. lea edi, [ecx + 128]
  161. ;
  162. L:
  163. movq mm2, [ebx] ; (+3 misaligned) 8 reference pixels
  164. ;
  165. movq mm4, [ecx] ; first 4 changes
  166. movq mm3, mm2
  167. movq mm5, [ecx + 8] ; last 4 changes
  168. punpcklbw mm2, mm0 ; turn first 4 refs into positive 16-bit #s
  169. paddsw mm2, mm4 ; add in first 4 changes
  170. punpckhbw mm3, mm0 ; turn last 4 refs into positive 16-bit #s
  171. paddsw mm3, mm5 ; add in last 4 changes
  172. add ebx, edx ; next row of reference pixels
  173. packuswb mm2, mm3 ; pack result to unsigned 8-bit values
  174. lea ecx, [ecx + 16] ; next row of changes
  175. cmp ecx, edi ; are we done?
  176. ;
  177. movq [eax], mm2 ; store result
  178. ;
  179. lea eax, [eax+edx] ; next row of output
  180. jc L ; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
  181. pop edi
  182. }
  183. }
  184. #else
  185. void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
  186. {
  187. // Note that the line step for the change data is assumed to be 8 * 32 bits.
  188. __asm
  189. {
  190. // Set up data pointers
  191. mov eax,dword ptr [ReconPtr]
  192. mov ebx,dword ptr [RefPtr]
  193. mov ecx,dword ptr [ChangePtr]
  194. mov edx,dword ptr [LineStep]
  195. pxor mm6, mm6 ; Blank mmx6
  196. // Row 1
  197. // Load the data values. The change data needs to be unpacked to words
  198. movq mm0,dword ptr [ebx] ; Load 8 elements of source data
  199. movq mm1, mm0 ; Copy data
  200. punpcklbw mm0, mm6 ; Low bytes to words
  201. punpckhbw mm1, mm6 ; High bytes to words
  202. // Load 8 elements of 16 bit change data
  203. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  204. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  205. // Sum the data
  206. paddsw mm0, mm2 ; First 4 values
  207. paddsw mm1, mm4 ; Second 4 values
  208. // Pack and store
  209. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  210. movq dword ptr [eax],mm0 ; Write the data out to the results buffer
  211. add ebx,edx ; Step the reference pointer.
  212. add ecx,16 ; Step the change pointer.
  213. add eax,edx ; Step the reconstruction pointer
  214. // Row 2
  215. // Load the data values. The change data needs to be unpacked to words
  216. movq mm0,dword ptr [ebx] ; Load 8 elements of source data
  217. movq mm1, mm0 ; Copy data
  218. punpcklbw mm0, mm6 ; Low bytes to words
  219. punpckhbw mm1, mm6 ; High bytes to words
  220. // Load 8 elements of 16 bit change data
  221. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  222. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  223. // Sum the data
  224. paddsw mm0, mm2 ; First 4 values
  225. paddsw mm1, mm4 ; Second 4 values
  226. // Pack and store
  227. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  228. movq dword ptr [eax],mm0 ; Write the data out to the results buffer
  229. add ebx,edx ; Step the reference pointer.
  230. add ecx,16 ; Step the change pointer.
  231. add eax,edx ; Step the reconstruction pointer
  232. // Row 3
  233. // Load the data values. The change data needs to be unpacked to words
  234. movq mm0,dword ptr [ebx] ; Load 8 elements of source data
  235. movq mm1, mm0 ; Copy data
  236. punpcklbw mm0, mm6 ; Low bytes to words
  237. punpckhbw mm1, mm6 ; High bytes to words
  238. // Load 8 elements of 16 bit change data
  239. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  240. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  241. // Sum the data
  242. paddsw mm0, mm2 ; First 4 values
  243. paddsw mm1, mm4 ; Second 4 values
  244. // Pack and store
  245. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  246. movq dword ptr [eax],mm0 ; Write the data out to the results buffer
  247. add ebx,edx ; Step the reference pointer.
  248. add ecx,16 ; Step the change pointer.
  249. add eax,edx ; Step the reconstruction pointer
  250. // Row 4
  251. // Load the data values. The change data needs to be unpacked to words
  252. movq mm0,dword ptr [ebx] ; Load 8 elements of source data
  253. movq mm1, mm0 ; Copy data
  254. punpcklbw mm0, mm6 ; Low bytes to words
  255. punpckhbw mm1, mm6 ; High bytes to words
  256. // Load 8 elements of 16 bit change data
  257. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  258. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  259. // Sum the data
  260. paddsw mm0, mm2 ; First 4 values
  261. paddsw mm1, mm4 ; Second 4 values
  262. // Pack and store
  263. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  264. movq dword ptr [eax],mm0 ; Write the data out to the results buffer
  265. add ebx,edx ; Step the reference pointer.
  266. add ecx,16 ; Step the change pointer.
  267. add eax,edx ; Step the reconstruction pointer
  268. // Row 5
  269. // Load the data values. The change data needs to be unpacked to words
  270. movq mm0,dword ptr [ebx] ; Load 8 elements of source data
  271. movq mm1, mm0 ; Copy data
  272. punpcklbw mm0, mm6 ; Low bytes to words
  273. punpckhbw mm1, mm6 ; High bytes to words
  274. // Load 8 elements of 16 bit change data
  275. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  276. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  277. // Sum the data
  278. paddsw mm0, mm2 ; First 4 values
  279. paddsw mm1, mm4 ; Second 4 values
  280. // Pack and store
  281. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  282. movq dword ptr [eax],mm0 ; Write the data out to the results buffer
  283. add ebx,edx ; Step the reference pointer.
  284. add ecx,16 ; Step the change pointer.
  285. add eax,edx ; Step the reconstruction pointer
  286. // Row 6
  287. // Load the data values. The change data needs to be unpacked to words
  288. movq mm0,dword ptr [ebx] ; Load 8 elements of source data
  289. movq mm1, mm0 ; Copy data
  290. punpcklbw mm0, mm6 ; Low bytes to words
  291. punpckhbw mm1, mm6 ; High bytes to words
  292. // Load 8 elements of 16 bit change data
  293. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  294. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  295. // Sum the data
  296. paddsw mm0, mm2 ; First 4 values
  297. paddsw mm1, mm4 ; Second 4 values
  298. // Pack and store
  299. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  300. movq dword ptr [eax],mm0 ; Write the data out to the results buffer
  301. add ebx,edx ; Step the reference pointer.
  302. add ecx,16 ; Step the change pointer.
  303. add eax,edx ; Step the reconstruction pointer
  304. // Row 7
  305. // Load the data values. The change data needs to be unpacked to words
  306. movq mm0,dword ptr [ebx] ; Load 8 elements of source data
  307. movq mm1, mm0 ; Copy data
  308. punpcklbw mm0, mm6 ; Low bytes to words
  309. punpckhbw mm1, mm6 ; High bytes to words
  310. // Load 8 elements of 16 bit change data
  311. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  312. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  313. // Sum the data
  314. paddsw mm0, mm2 ; First 4 values
  315. paddsw mm1, mm4 ; Second 4 values
  316. // Pack and store
  317. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  318. movq dword ptr [eax],mm0 ; Write the data out to the results buffer
  319. add ebx,edx ; Step the reference pointer.
  320. add ecx,16 ; Step the change pointer.
  321. add eax,edx ; Step the reconstruction pointer
  322. // Row 8
  323. // Load the data values. The change data needs to be unpacked to words
  324. movq mm0,dword ptr [ebx] ; Load 8 elements of source data
  325. movq mm1, mm0 ; Copy data
  326. punpcklbw mm0, mm6 ; Low bytes to words
  327. punpckhbw mm1, mm6 ; High bytes to words
  328. // Load 8 elements of 16 bit change data
  329. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  330. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  331. // Sum the data
  332. paddsw mm0, mm2 ; First 4 values
  333. paddsw mm1, mm4 ; Second 4 values
  334. // Pack and store
  335. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  336. movq dword ptr [eax],mm0 ; Write the data out to the results buffer
  337. //emms ; Clear the MMX state.
  338. }
  339. }
  340. #endif
  341. /****************************************************************************
  342. *
  343. * ROUTINE : MmxReconInterHalfPixel2
  344. *
  345. * INPUTS : UINT8 * RefPtr1, RefPtr2
  346. * The last frame reference
  347. *
  348. * INT16 * ChangePtr
  349. * Pointer to the change data
  350. *
  351. * UINT32 LineStep
  352. * Line Length in pixels in recon and ref images
  353. *
  354. *
  355. * OUTPUTS : UINT8 * ReconPtr
  356. * The reconstruction
  357. *
  358. * RETURNS : None
  359. *
  360. * FUNCTION : Reconstructs data from half pixel reference data and change.
  361. * Half pixel data interpolated from 2 references.
  362. *
  363. * SPECIAL NOTES :
  364. *
  365. *
  366. * ERRORS : None.
  367. *
  368. ****************************************************************************/
  369. #if USING_TIMS
  370. #define A 0
  371. void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
  372. UINT8 * RefPtr1, UINT8 * RefPtr2,
  373. INT16 * ChangePtr, UINT32 LineStep )
  374. {
  375. # if A
  376. static culong FourOnes[2] = { 65537, 65537}; // only read once
  377. # endif
  378. (void) TmpDataBuffer;
  379. _asm {
  380. push esi
  381. push edi
  382. ;; mov ecx, [diff]
  383. ;; mov esi, [ref1]
  384. ;; mov edi, [ref2]
  385. ;; mov ebx, [dest]
  386. ;; mov edx, [stride]
  387. mov ecx, [ChangePtr]
  388. mov esi, [RefPtr1]
  389. mov edi, [RefPtr2]
  390. mov ebx, [ReconPtr]
  391. mov edx, [LineStep]
  392. lea eax, [ecx+128]
  393. # if A
  394. movq mm1, [FourOnes]
  395. # endif
  396. pxor mm0, mm0
  397. L:
  398. movq mm2, [esi] ; (+3 misaligned) mm2 = row from ref1
  399. ;
  400. movq mm4, [edi] ; (+3 misaligned) mm4 = row from ref2
  401. movq mm3, mm2
  402. punpcklbw mm2, mm0 ; mm2 = start ref1 as positive 16-bit #s
  403. movq mm5, mm4
  404. movq mm6, [ecx] ; mm6 = first 4 changes
  405. punpckhbw mm3, mm0 ; mm3 = end ref1 as positive 16-bit #s
  406. movq mm7, [ecx+8] ; mm7 = last 4 changes
  407. punpcklbw mm4, mm0 ; mm4 = start ref2 as positive 16-bit #s
  408. punpckhbw mm5, mm0 ; mm5 = end ref2 as positive 16-bit #s
  409. paddw mm2, mm4 ; mm2 = start (ref1 + ref2)
  410. paddw mm3, mm5 ; mm3 = end (ref1 + ref2)
  411. # if A
  412. paddw mm2, mm1 ; rounding adjustment
  413. paddw mm3, mm1
  414. # endif
  415. psrlw mm2, 1 ; mm2 = start (ref1 + ref2)/2
  416. psrlw mm3, 1 ; mm3 = end (ref1 + ref2)/2
  417. paddw mm2, mm6 ; add changes to start
  418. paddw mm3, mm7 ; add changes to end
  419. lea ecx, [ecx+16] ; next row idct
  420. packuswb mm2, mm3 ; pack start|end to unsigned 8-bit
  421. add esi, edx ; next row ref1
  422. add edi, edx ; next row ref2
  423. cmp ecx, eax
  424. movq [ebx], mm2 ; store result
  425. ;
  426. lea ebx, [ebx+edx]
  427. jc L ; 22c / 8 elts = 33c / 8 pixels = 4.125 c/pix
  428. pop edi
  429. pop esi
  430. }
  431. }
  432. #undef A
  433. #else
  434. void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
  435. UINT8 * RefPtr1, UINT8 * RefPtr2,
  436. INT16 * ChangePtr, UINT32 LineStep )
  437. {
  438. UINT8 * TmpDataPtr = (UINT8 *)TmpDataBuffer->TmpReconBuffer;
  439. // Note that the line step for the change data is assumed to be 8 * 32 bits.
  440. __asm
  441. {
  442. pxor mm6, mm6 ; Blank mmx6
  443. // Set up data pointers
  444. mov eax,dword ptr [RefPtr1]
  445. mov ebx,dword ptr [RefPtr2]
  446. mov edx,dword ptr [LineStep]
  447. // Row 1
  448. // Load the change pointer
  449. mov ecx,dword ptr [ChangePtr]
  450. // Load the data values (Ref1 and Ref2) and unpack to signed 16 bit values
  451. movq mm0,dword ptr [eax] ; Load 8 elements of source data
  452. movq mm2,dword ptr [ebx] ; Load 8 elements of source data
  453. movq mm1, mm0 ; Copy data
  454. movq mm3, mm2 ; Copy data
  455. punpcklbw mm0, mm6 ; Low bytes to words
  456. punpcklbw mm2, mm6 ; Low bytes to words
  457. punpckhbw mm1, mm6 ; High bytes to words
  458. punpckhbw mm3, mm6 ; High bytes to words
  459. // Average Ref1 and Ref2
  460. paddw mm0, mm2 ; First 4 values
  461. paddw mm1, mm3 ; Second 4 values
  462. psrlw mm0, 1
  463. psrlw mm1, 1
  464. // Load 8 elements of 16 bit change data
  465. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  466. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  467. // Sum the data reference and difference data
  468. paddw mm0, mm2 ; First 4 values
  469. paddw mm1, mm4 ; Second 4 values
  470. // Pack and store
  471. mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
  472. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  473. movq dword ptr [ecx],mm0 ; Write the data out to the temporary results buffer
  474. add eax,edx ; Step the reference pointers
  475. add ebx,edx
  476. // Row 2
  477. // Load the change pointer
  478. mov ecx,dword ptr [ChangePtr]
  479. add ecx,16
  480. // Load the data values (Ref1 and Ref2).
  481. movq mm0,dword ptr [eax] ; Load 8 elements of source data
  482. movq mm1, mm0 ; Copy data
  483. punpcklbw mm0, mm6 ; Low bytes to words
  484. punpckhbw mm1, mm6 ; High bytes to words
  485. movq mm2,dword ptr [ebx] ; Load 8 elements of source data
  486. movq mm3, mm2 ; Copy data
  487. punpcklbw mm2, mm6 ; Low bytes to words
  488. punpckhbw mm3, mm6 ; High bytes to words
  489. // Average Ref1 and Ref2
  490. paddw mm0, mm2 ; First 4 values
  491. paddw mm1, mm3 ; Second 4 values
  492. psrlw mm0, 1
  493. psrlw mm1, 1
  494. // Load 8 elements of 16 bit change data
  495. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  496. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  497. // Sum the data reference and difference data
  498. paddw mm0, mm2 ; First 4 values
  499. paddw mm1, mm4 ; Second 4 values
  500. // Pack and store
  501. mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
  502. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  503. movq dword ptr [ecx+8],mm0 ; Write the data out to the temporary results buffer
  504. add eax,edx ; Step the reference pointers
  505. add ebx,edx
  506. // Row 3
  507. // Load the change pointer
  508. mov ecx,dword ptr [ChangePtr]
  509. add ecx,32
  510. // Load the data values (Ref1 and Ref2).
  511. movq mm0,dword ptr [eax] ; Load 8 elements of source data
  512. movq mm2,dword ptr [ebx] ; Load 8 elements of source data
  513. movq mm1, mm0 ; Copy data
  514. movq mm3, mm2 ; Copy data
  515. punpcklbw mm0, mm6 ; Low bytes to words
  516. punpckhbw mm1, mm6 ; High bytes to words
  517. punpcklbw mm2, mm6 ; Low bytes to words
  518. punpckhbw mm3, mm6 ; High bytes to words
  519. // Average Ref1 and Ref2
  520. paddw mm0, mm2 ; First 4 values
  521. paddw mm1, mm3 ; Second 4 values
  522. psrlw mm0, 1
  523. psrlw mm1, 1
  524. // Load 8 elements of 16 bit change data
  525. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  526. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  527. // Sum the data reference and difference data
  528. paddw mm0, mm2 ; First 4 values
  529. paddw mm1, mm4 ; Second 4 values
  530. // Pack and store
  531. mov ecx,dword ptr [TmpDataPtr]
  532. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  533. movq dword ptr [ecx+16],mm0 ; Write the data out to the temporary results buffer
  534. add eax,edx ; Step the reference pointers
  535. add ebx,edx
  536. // Row 4
  537. // Load the change pointer
  538. mov ecx,dword ptr [ChangePtr]
  539. add ecx,48
  540. // Load the data values (Ref1 and Ref2).
  541. movq mm0,dword ptr [eax] ; Load 8 elements of source data
  542. movq mm2,dword ptr [ebx] ; Load 8 elements of source data
  543. movq mm1, mm0 ; Copy data
  544. movq mm3, mm2 ; Copy data
  545. punpcklbw mm0, mm6 ; Low bytes to words
  546. punpckhbw mm1, mm6 ; High bytes to words
  547. punpcklbw mm2, mm6 ; Low bytes to words
  548. punpckhbw mm3, mm6 ; High bytes to words
  549. // Average Ref1 and Ref2
  550. paddw mm0, mm2 ; First 4 values
  551. paddw mm1, mm3 ; Second 4 values
  552. psrlw mm0, 1
  553. psrlw mm1, 1
  554. // Load 8 elements of 16 bit change data
  555. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  556. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  557. // Sum the data reference and difference data
  558. paddw mm0, mm2 ; First 4 values
  559. paddw mm1, mm4 ; Second 4 values
  560. // Pack and store
  561. mov ecx,dword ptr [TmpDataPtr]
  562. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  563. movq dword ptr [ecx+24],mm0 ; Write the data out to the temporary results buffer
  564. add eax,edx ; Step the reference pointers
  565. add ebx,edx
  566. // Row 5
  567. // Load the change pointer
  568. mov ecx,dword ptr [ChangePtr]
  569. add ecx,64
  570. // Load the data values (Ref1 and Ref2).
  571. movq mm0,dword ptr [eax] ; Load 8 elements of source data
  572. movq mm2,dword ptr [ebx] ; Load 8 elements of source data
  573. movq mm1, mm0 ; Copy data
  574. movq mm3, mm2 ; Copy data
  575. punpcklbw mm0, mm6 ; Low bytes to words
  576. punpckhbw mm1, mm6 ; High bytes to words
  577. punpcklbw mm2, mm6 ; Low bytes to words
  578. punpckhbw mm3, mm6 ; High bytes to words
  579. // Average Ref1 and Ref2
  580. paddw mm0, mm2 ; First 4 values
  581. paddw mm1, mm3 ; Second 4 values
  582. psrlw mm0, 1
  583. psrlw mm1, 1
  584. // Load 8 elements of 16 bit change data
  585. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  586. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  587. // Sum the data reference and difference data
  588. paddw mm0, mm2 ; First 4 values
  589. paddw mm1, mm4 ; Second 4 values
  590. // Pack and store
  591. mov ecx,dword ptr [TmpDataPtr]
  592. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  593. movq dword ptr [ecx+32],mm0 ; Write the data out to the temporary results buffer
  594. add eax,edx ; Step the reference pointers
  595. add ebx,edx
  596. // Row 6
  597. // Load the change pointer
  598. mov ecx,dword ptr [ChangePtr]
  599. add ecx,80
  600. // Load the data values (Ref1 and Ref2).
  601. movq mm0,dword ptr [eax] ; Load 8 elements of source data
  602. movq mm2,dword ptr [ebx] ; Load 8 elements of source data
  603. movq mm1, mm0 ; Copy data
  604. movq mm3, mm2 ; Copy data
  605. punpcklbw mm0, mm6 ; Low bytes to words
  606. punpckhbw mm1, mm6 ; High bytes to words
  607. punpcklbw mm2, mm6 ; Low bytes to words
  608. punpckhbw mm3, mm6 ; High bytes to words
  609. // Average Ref1 and Ref2
  610. paddw mm0, mm2 ; First 4 values
  611. paddw mm1, mm3 ; Second 4 values
  612. psrlw mm0, 1
  613. psrlw mm1, 1
  614. // Load 8 elements of 16 bit change data
  615. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  616. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  617. // Sum the data reference and difference data
  618. paddw mm0, mm2 ; First 4 values
  619. paddw mm1, mm4 ; Second 4 values
  620. // Pack and store
  621. mov ecx,dword ptr [TmpDataPtr]
  622. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  623. movq dword ptr [ecx+40],mm0 ; Write the data out to the temporary results buffer
  624. add eax,edx ; Step the reference pointers
  625. add ebx,edx
  626. // Row 7
  627. // Load the change pointer
  628. mov ecx,dword ptr [ChangePtr]
  629. add ecx,96
  630. // Load the data values (Ref1 and Ref2).
  631. movq mm0,dword ptr [eax] ; Load 8 elements of source data
  632. movq mm2,dword ptr [ebx] ; Load 8 elements of source data
  633. movq mm1, mm0 ; Copy data
  634. movq mm3, mm2 ; Copy data
  635. punpcklbw mm0, mm6 ; Low bytes to words
  636. punpckhbw mm1, mm6 ; High bytes to words
  637. punpcklbw mm2, mm6 ; Low bytes to words
  638. punpckhbw mm3, mm6 ; High bytes to words
  639. // Average Ref1 and Ref2
  640. paddw mm0, mm2 ; First 4 values
  641. paddw mm1, mm3 ; Second 4 values
  642. psrlw mm0, 1
  643. psrlw mm1, 1
  644. // Load 8 elements of 16 bit change data
  645. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  646. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  647. // Sum the data reference and difference data
  648. paddw mm0, mm2 ; First 4 values
  649. paddw mm1, mm4 ; Second 4 values
  650. // Pack and store
  651. mov ecx,dword ptr [TmpDataPtr]
  652. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  653. movq dword ptr [ecx+48],mm0 ; Write the data out to the temporary results buffer
  654. add eax,edx ; Step the reference pointers
  655. add ebx,edx
  656. // Row 8
  657. // Load the change pointer
  658. mov ecx,dword ptr [ChangePtr]
  659. add ecx,112
  660. // Load the data values (Ref1 and Ref2).
  661. movq mm0,dword ptr [eax] ; Load 8 elements of source data
  662. movq mm2,dword ptr [ebx] ; Load 8 elements of source data
  663. movq mm1, mm0 ; Copy data
  664. movq mm3, mm2 ; Copy data
  665. punpcklbw mm0, mm6 ; Low bytes to words
  666. punpckhbw mm1, mm6 ; High bytes to words
  667. punpcklbw mm2, mm6 ; Low bytes to words
  668. punpckhbw mm3, mm6 ; High bytes to words
  669. // Average Ref1 and Ref2
  670. paddw mm0, mm2 ; First 4 values
  671. paddw mm1, mm3 ; Second 4 values
  672. psrlw mm0, 1
  673. psrlw mm1, 1
  674. // Load 8 elements of 16 bit change data
  675. movq mm2,dword ptr [ecx] ; Load 4 elements of change data
  676. movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
  677. // Sum the data reference and difference data
  678. paddw mm0, mm2 ; First 4 values
  679. paddw mm1, mm4 ; Second 4 values
  680. // Pack and store
  681. mov ecx,dword ptr [TmpDataPtr]
  682. packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
  683. movq dword ptr [ecx+56],mm0 ; Write the data out to the temporary results buffer
  684. // Now copy the results back to the reconstruction buffer.
  685. mov eax,dword ptr [ReconPtr] ; Load the reconstruction Pointer
  686. mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
  687. // Row 1
  688. movq mm0,dword ptr [ecx] ; Load 8 elements of results data
  689. movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
  690. add eax,edx ; Step the reconstruction pointer
  691. // Row 2
  692. movq mm0,dword ptr [ecx+8] ; Load 8 elements of results data
  693. movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
  694. add eax,edx ; Step the reconstruction pointer
  695. // Row 3
  696. movq mm0,dword ptr [ecx+16] ; Load 8 elements of results data
  697. movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
  698. add eax,edx ; Step the reconstruction pointer
  699. // Row 4
  700. movq mm0,dword ptr [ecx+24] ; Load 8 elements of results data
  701. movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
  702. add eax,edx ; Step the reconstruction pointer
  703. // Row 5
  704. movq mm0,dword ptr [ecx+32] ; Load 8 elements of results data
  705. movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
  706. add eax,edx ; Step the reconstruction pointer
  707. // Row 6
  708. movq mm0,dword ptr [ecx+40] ; Load 8 elements of results data
  709. movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
  710. add eax,edx ; Step the reconstruction pointer
  711. // Row 7
  712. movq mm0,dword ptr [ecx+48] ; Load 8 elements of results data
  713. movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
  714. add eax,edx ; Step the reconstruction pointer
  715. // Row 8
  716. movq mm0,dword ptr [ecx+56] ; Load 8 elements of results data
  717. movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
  718. add eax,edx ; Step the reconstruction pointer
  719. //emms
  720. }
  721. }
  722. #endif