vputilasm.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. /****************************************************************************
  2. *
  3. * Module Title : vputilasm.c
  4. *
  5. * Description : Codec specific functions
  6. *
  7. * AUTHOR : Yaowu Xu
  8. *
  9. *****************************************************************************
  10. * Revision History
  11. *
  12. * 1.02 YWX 03-Nov-00 Changed confusing variable name
  13. * 1.01 YWX 02-Nov-00 Added the set of functions
  14. * 1.00 YWX 19-Oct-00 configuration baseline
  15. *****************************************************************************
  16. */
  17. /****************************************************************************
  18. * Header Files
  19. *****************************************************************************
  20. */
  21. #define STRICT /* Strict type checking. */
  22. #include "codec_common.h"
  23. #include <math.h>
  24. /****************************************************************************
  25. * Module constants.
  26. *****************************************************************************
  27. */
  28. #define MIN(a, b) (((a) < (b)) ? (a) : (b))
  29. /****************************************************************************
  30. * Explicit Imports
  31. *****************************************************************************
  32. */
  33. extern void SatUnsigned8( UINT8 * ResultPtr, INT16 * DataBlock,
  34. UINT32 ResultLineStep, UINT32 DataLineStep );
  35. /****************************************************************************
  36. * Exported Global Variables
  37. *****************************************************************************
  38. */
  39. /****************************************************************************
  40. * Exported Functions
  41. *****************************************************************************
  42. */
  43. /****************************************************************************
  44. * Module Statics
  45. *****************************************************************************
  46. */
  47. /****************************************************************************
  48. * Forward References
  49. *****************************************************************************
  50. */
  51. /****************************************************************************
  52. *
  53. * ROUTINE : ClearMmx()
  54. *
  55. *
  56. * INPUTS : None
  57. *
  58. * OUTPUTS :
  59. *
  60. * RETURNS :
  61. *
  62. *
  63. * FUNCTION : Clears down the MMX state
  64. *
  65. * SPECIAL NOTES : None.
  66. *
  67. *
  68. * ERRORS : None.
  69. *
  70. ****************************************************************************/
  71. void ClearMmx(void)
  72. {
  73. __asm
  74. {
  75. emms ; Clear the MMX state.
  76. }
  77. }
  78. /****************************************************************************
  79. *
  80. * ROUTINE : CopyBlockUsingMMX
  81. *
  82. * INPUTS : None
  83. *
  84. * OUTPUTS : None
  85. *
  86. * RETURNS : None.
  87. *
  88. * FUNCTION : Copies a block from source to destination
  89. *
  90. * SPECIAL NOTES : None.
  91. *
  92. *
  93. * ERRORS : None.
  94. *
  95. ****************************************************************************/
  96. void CopyBlockMMX(unsigned char *src, unsigned char *dest, unsigned int srcstride)
  97. {
  98. unsigned char *s = src;
  99. unsigned char *d = dest;
  100. unsigned int stride = srcstride;
  101. // recon copy
  102. _asm
  103. {
  104. mov ecx, [stride]
  105. mov eax, [s]
  106. mov ebx, [d]
  107. lea edx, [ecx + ecx * 2]
  108. movq mm0, [eax]
  109. movq mm1, [eax + ecx]
  110. movq mm2, [eax + ecx*2]
  111. movq mm3, [eax + edx]
  112. lea eax, [eax + ecx*4]
  113. movq [ebx], mm0
  114. movq [ebx + ecx], mm1
  115. movq [ebx + ecx*2], mm2
  116. movq [ebx + edx], mm3
  117. lea ebx, [ebx + ecx * 4]
  118. movq mm0, [eax]
  119. movq mm1, [eax + ecx]
  120. movq mm2, [eax + ecx*2]
  121. movq mm3, [eax + edx]
  122. movq [ebx], mm0
  123. movq [ebx + ecx], mm1
  124. movq [ebx + ecx*2], mm2
  125. movq [ebx + edx], mm3
  126. }
  127. }
  /****************************************************************************
  *
  * ROUTINE : Copy12x12_MMX
  *
  * INPUTS : src        - first byte of the 12x12 source block
  *          dest       - first byte of the destination block
  *          srcstride  - byte step between successive source rows
  *          deststride - byte step between successive destination rows
  *
  * OUTPUTS : Twelve rows of twelve bytes written through dest.
  *
  * RETURNS : None.
  *
  * FUNCTION : Copies a 12x12 block of bytes from source to destination.
  *
  * SPECIAL NOTES : Plain C despite the name; no MMX instructions are used.
  *                 Bytes are copied individually instead of as three
  *                 32-bit words, so src and dest need no particular
  *                 alignment, the byte buffers are never accessed through
  *                 an incompatible pointer type, and the const qualifier
  *                 on src is not cast away. The source and destination
  *                 blocks must not overlap.
  *
  * ERRORS : None.
  *
  ****************************************************************************/
  void Copy12x12_MMX(
      const unsigned char *src,
      unsigned char *dest,
      unsigned int srcstride,
      unsigned int deststride)
  {
      enum { COPY12_DIM = 12 };   /* block is COPY12_DIM x COPY12_DIM bytes */
      int row;
      int col;

      for (row = 0; row < COPY12_DIM; row++)
      {
          for (col = 0; col < COPY12_DIM; col++)
          {
              dest[col] = src[col];
          }

          src += srcstride;
          dest += deststride;
      }
  }
  163. /****************************************************************************
  164. /****************************************************************************
  165. *
  166. * ROUTINE : AverageBlock_MMX
  167. *
  168. * INPUTS : Two block data to be averaged
  169. *
  170. * OUTPUTS : block with the average values
  171. *
  172. * RETURNS : None.
  173. *
  174. * FUNCTION : Do pixel averages on two reference blocks
  175. *
  176. * SPECIAL NOTES : This functions has a mmx version in newlooptest_asm.c
  177. *
  178. * ERRORS : None.
  179. *
  180. ****************************************************************************/
  181. void AverageBlock_MMX( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine)
  182. {
  183. __asm
  184. {
  185. mov esi, ReconPtr1
  186. mov eax, ReconPtr2
  187. mov edi, ReconRefPtr
  188. mov ecx, BLOCK_HEIGHT_WIDTH
  189. mov edx, ReconPixelsPerLine
  190. pxor mm7, mm7
  191. AverageBlock_Loop:
  192. movq mm0, [esi]
  193. movq mm1, [eax]
  194. movq mm2, mm0
  195. punpcklbw mm0, mm7
  196. movq mm3, mm1
  197. punpcklbw mm1, mm7
  198. paddw mm0, mm1
  199. punpckhbw mm2, mm7
  200. psraw mm0, 1
  201. punpckhbw mm3, mm7
  202. paddw mm2, mm3
  203. movq [edi], mm0
  204. psraw mm2, 1
  205. add esi, edx
  206. add eax, edx
  207. add edi, 16
  208. movq [edi-8], mm2
  209. dec ecx
  210. jnz AverageBlock_Loop
  211. }
  212. /*
  213. UINT32 i;
  214. // For each block row
  215. for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
  216. {
  217. ReconRefPtr[0] = (INT16)((INT32)(ReconPtr1[0])+ ((INT32)ReconPtr2[0]))>>1;
  218. ReconRefPtr[1] = (INT16)((INT32)(ReconPtr1[1])+ ((INT32)ReconPtr2[1]))>>1;
  219. ReconRefPtr[2] = (INT16)((INT32)(ReconPtr1[2])+ ((INT32)ReconPtr2[2]))>>1;
  220. ReconRefPtr[3] = (INT16)((INT32)(ReconPtr1[3])+ ((INT32)ReconPtr2[3]))>>1;
  221. ReconRefPtr[4] = (INT16)((INT32)(ReconPtr1[4])+ ((INT32)ReconPtr2[4]))>>1;
  222. ReconRefPtr[5] = (INT16)((INT32)(ReconPtr1[5])+ ((INT32)ReconPtr2[5]))>>1;
  223. ReconRefPtr[6] = (INT16)((INT32)(ReconPtr1[6])+ ((INT32)ReconPtr2[6]))>>1;
  224. ReconRefPtr[7] = (INT16)((INT32)(ReconPtr1[7])+ ((INT32)ReconPtr2[7]))>>1;
  225. // Start next row
  226. ReconPtr1 += ReconPixelsPerLine;
  227. ReconPtr2 += ReconPixelsPerLine;
  228. ReconRefPtr += BLOCK_HEIGHT_WIDTH;
  229. }
  230. */
  231. }
  232. /****************************************************************************
  233. *
  234. * ROUTINE : UnpackBlock
  235. *
  236. * INPUTS : Block of char data to be converted to short
  237. *
  238. * OUTPUTS : converted output
  239. *
  240. * RETURNS : None.
  241. *
  242. * FUNCTION : Converted char block data to short
  243. *
  244. * SPECIAL NOTES : This functions has a mmx version in newlooptest_asm.c
  245. *
  246. * ERRORS : None.
  247. *
  248. ****************************************************************************/
  249. void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine)
  250. {
  251. __asm
  252. {
  253. mov esi, ReconPtr
  254. mov edi, ReconRefPtr
  255. mov ecx, BLOCK_HEIGHT_WIDTH
  256. mov edx, ReconPixelsPerLine
  257. pxor mm7, mm7
  258. UnpackBlock_Loop:
  259. movq mm0, [esi]
  260. movq mm2, mm0
  261. punpcklbw mm0, mm7
  262. movq [edi], mm0
  263. punpckhbw mm2, mm7
  264. add esi, edx
  265. movq [edi+8], mm2
  266. add edi, 16
  267. dec ecx
  268. jnz UnpackBlock_Loop
  269. }
  270. /*
  271. UINT32 i;
  272. // For each block row
  273. for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
  274. {
  275. ReconRefPtr[0] = (INT16)(ReconPtr[0]);
  276. ReconRefPtr[1] = (INT16)(ReconPtr[1]);
  277. ReconRefPtr[2] = (INT16)(ReconPtr[2]);
  278. ReconRefPtr[3] = (INT16)(ReconPtr[3]);
  279. ReconRefPtr[4] = (INT16)(ReconPtr[4]);
  280. ReconRefPtr[5] = (INT16)(ReconPtr[5]);
  281. ReconRefPtr[6] = (INT16)(ReconPtr[6]);
  282. ReconRefPtr[7] = (INT16)(ReconPtr[7]);
  283. // Start next row
  284. ReconPtr += ReconPixelsPerLine;
  285. ReconRefPtr += BLOCK_HEIGHT_WIDTH;
  286. }
  287. */
  288. }
  289. /****************************************************************************
  290. *
  291. * ROUTINE : SubtractBlock
  292. *
  293. * INPUTS : Get the residue data for the block
  294. *
  295. * OUTPUTS : Source block data and ref block data
  296. *
  297. * RETURNS : residue block data
  298. *
  299. * FUNCTION : do pixel subtraction of ref block from source block
  300. *
  301. * SPECIAL NOTES : This functions has a mmx version in newlooptest_asm.c
  302. *
  303. * ERRORS : None.
  304. *
  305. ****************************************************************************/
  306. void SubtractBlock_MMX( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep )
  307. {
  308. __asm
  309. {
  310. mov esi, SrcBlock
  311. mov edi, DestPtr
  312. mov edx, LineStep
  313. mov ecx, 8
  314. pxor mm7, mm7
  315. SubtractBlock_Loop:
  316. movq mm0, [esi]
  317. movq mm1, [edi]
  318. movq mm2, mm0
  319. punpcklbw mm0, mm7
  320. movq mm3, [edi+8]
  321. psubw mm0, mm1
  322. punpckhbw mm2, mm7
  323. movq [edi], mm0
  324. psubw mm2, mm3
  325. add esi, edx
  326. movq [edi+8], mm2
  327. add edi, 16
  328. dec ecx
  329. jnz SubtractBlock_Loop
  330. }
  331. /*
  332. UINT32 i;
  333. // For each block row
  334. for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
  335. {
  336. DestPtr[0] = (INT16)((INT32)SrcBlock[0] - (INT32)DestPtr[0]);
  337. DestPtr[1] = (INT16)((INT32)SrcBlock[1] - (INT32)DestPtr[1]);
  338. DestPtr[2] = (INT16)((INT32)SrcBlock[2] - (INT32)DestPtr[2]);
  339. DestPtr[3] = (INT16)((INT32)SrcBlock[3] - (INT32)DestPtr[3]);
  340. DestPtr[4] = (INT16)((INT32)SrcBlock[4] - (INT32)DestPtr[4]);
  341. DestPtr[5] = (INT16)((INT32)SrcBlock[5] - (INT32)DestPtr[5]);
  342. DestPtr[6] = (INT16)((INT32)SrcBlock[6] - (INT32)DestPtr[6]);
  343. DestPtr[7] = (INT16)((INT32)SrcBlock[7] - (INT32)DestPtr[7]);
  344. // Start next row
  345. SrcBlock += LineStep;
  346. DestPtr += BLOCK_HEIGHT_WIDTH;
  347. }
  348. */
  349. }
  350. /****************************************************************************
  351. *
  352. * ROUTINE : ReconBlock
  353. *
  354. * INPUTS :
  355. *
  356. * OUTPUTS :
  357. *
  358. * RETURNS :
  359. *
  360. * FUNCTION : Reconstrut a block using ref blocka and change data
  361. *
  362. * SPECIAL NOTES : This functions has a mmx version in newlooptest_asm.c
  363. *
  364. * ERRORS : None.
  365. *
  366. ****************************************************************************/
  367. void ReconBlock_MMX( INT16 *SrcBlock, INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep)
  368. {
  369. __asm
  370. {
  371. mov esi, SrcBlock
  372. mov eax, ReconRefPtr
  373. mov edi, DestBlock
  374. mov ecx, 8
  375. mov edx, LineStep
  376. pxor mm7, mm7
  377. ReconBlock_Loop:
  378. movq mm0, [esi]
  379. movq mm1, [eax]
  380. movq mm2, [esi+8]
  381. movq mm3, [eax+8]
  382. paddw mm0, mm1
  383. paddw mm2, mm3
  384. packuswb mm0, mm2
  385. movq [edi], mm0
  386. add esi, 16
  387. add eax, 16
  388. add edi, edx
  389. dec ecx
  390. jnz ReconBlock_Loop
  391. }
  392. /*
  393. UINT32 i;
  394. INT16 *SrcBlockPtr = SrcBlock;
  395. // For each block row
  396. for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
  397. {
  398. SrcBlock[0] += ReconRefPtr[0];
  399. SrcBlock[1] += ReconRefPtr[1];
  400. SrcBlock[2] += ReconRefPtr[2];
  401. SrcBlock[3] += ReconRefPtr[3];
  402. SrcBlock[4] += ReconRefPtr[4];
  403. SrcBlock[5] += ReconRefPtr[5];
  404. SrcBlock[6] += ReconRefPtr[6];
  405. SrcBlock[7] += ReconRefPtr[7];
  406. // Start next row
  407. SrcBlock += BLOCK_HEIGHT_WIDTH;
  408. ReconRefPtr += BLOCK_HEIGHT_WIDTH;
  409. }
  410. // Saturated the block and write to the output
  411. SatUnsigned8( DestBlock, SrcBlockPtr, LineStep, BLOCK_HEIGHT_WIDTH );
  412. */
  413. }