wmtrecon.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. /****************************************************************************
  2. *
  3. * Module Title : WmtOptFunctions.c
  4. *
  5. * Description : Willamette processor-specific
  6. * optimised versions of functions
  7. *
  8. * AUTHOR : Yaowu Xu
  9. *
  10. * Special Note:
  11. *
  12. *****************************************************************************
  13. * Revision History
  14. *
  15. *
  16. * 1.03 YWX 07-Dec-00 Removed constants and functions that are not in use
  17. * Added push and pop ebx in WmtReconIntra
  18. * 1.02 YWX 30 Aug 00 changed to be compatible with Microsoft compiler
  19. * 1.01 YWX 13 JUL 00 New Willamette Optimized Functions
  20. * 1.00 YWX 14/06/00 Configuration baseline from OptFunctions.c
  21. *
  22. *****************************************************************************
  23. */
  24. /*
  25. Use Tim's optimized version.
  26. */
  27. /****************************************************************************
  28. * Header Files
  29. *****************************************************************************
  30. */
  31. #define STRICT // Strict type checking.
  32. #include "reconstruct.h"
  33. /****************************************************************************
  34. * Module constants.
  35. *****************************************************************************
  36. */
  37. /****************************************************************************
  38. * Imports.
  39. *****************************************************************************
  40. */
  41. /****************************************************************************
  42. * Exported Global Variables
  43. *****************************************************************************
  44. */
  45. /****************************************************************************
  46. * Exported Functions
  47. *****************************************************************************
  48. */
  49. /****************************************************************************
  50. * Module Statics
  51. *****************************************************************************
  52. */
  53. _declspec(align(16)) static UINT8 Eight128s[8] = {128,128,128,128,128,128,128,128};
  54. #pragma warning( disable : 4799 ) // Disable no emms instruction warning!
  55. /****************************************************************************
  56. * Forward References
  57. *****************************************************************************
  58. */
  59. /****************************************************************************
  60. *
  61. * ROUTINE : WmtReconIntra
  62. *
  63. * INPUTS : INT16 * idct
  64. * Pointer to the output from the idct for this block
  65. *
  66. * UINT32 stride
  67. * Line Length in pixels in recon and reference images
  68. *
  69. *
  70. *
  71. *
  72. * OUTPUTS : UINT8 * dest
  73. * The reconstruction buffer
  74. *
  75. * RETURNS : None
  76. *
  77. * FUNCTION : Reconstructs an intra block - wmt version
  78. *
  79. *
  80. * ERRORS : None.
  81. *
  82. ****************************************************************************/
  83. void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
  84. {
  85. (void)TmpDataBuffer;
  86. __asm
  87. {
  88. push ebx
  89. mov eax,[idct] ; Signed 16 bit inputs
  90. mov edx,[dest] ; Unsigned 8 bit outputs
  91. movq xmm0,QWORD PTR [Eight128s] ; Set xmm0 to 0x000000000000008080808080808080
  92. pxor xmm3, xmm3 ; set xmm3 to 0
  93. ;
  94. mov ebx,[stride] ; Line stride in output buffer
  95. lea ecx,[eax+128] ; Endpoint in input buffer
  96. loop_label:
  97. movdqa xmm2,XMMWORD PTR [eax] ; Read the eight inputs
  98. packsswb xmm2,xmm3 ;
  99. pxor xmm2,xmm0 ; Convert result to unsigned (same as add 128)
  100. lea eax,[eax + 16] ; Step source buffer
  101. cmp eax,ecx ; are we done
  102. movq QWORD PTR [edx],xmm2 ; store results
  103. lea edx,[edx+ebx] ; Step output buffer
  104. jc loop_label ; Loop back if we are not done
  105. pop ebx
  106. }
  107. }
  108. /****************************************************************************
  109. *
  110. * ROUTINE : WmtReconInter
  111. *
  112. * INPUTS : UINT8 * RefPtr
  113. * The last frame reference
  114. *
  115. * INT16 * ChangePtr
  116. * Pointer to the change data
  117. *
  118. * UINT32 LineStep
  119. * Line Length in pixels in recon and ref images
  120. *
  121. * OUTPUTS : UINT8 * ReconPtr
  122. * The reconstruction
  123. *
  124. * RETURNS : None
  125. *
  126. * FUNCTION : Reconstructs data from last data and change
  127. *
  128. * SPECIAL NOTES :
  129. *
  130. *
  131. * ERRORS : None.
  132. *
  133. ****************************************************************************/
  134. void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
  135. {
  136. (void) TmpDataBuffer;
  137. _asm {
  138. push edi
  139. mov ebx, [RefPtr]
  140. mov ecx, [ChangePtr]
  141. mov eax, [ReconPtr]
  142. mov edx, [LineStep]
  143. pxor xmm0, xmm0
  144. lea edi, [ecx + 128]
  145. L:
  146. movq xmm2, QWORD ptr [ebx] ; (+3 misaligned) 8 reference pixels
  147. movdqa xmm4, XMMWORD ptr [ecx] ; 8 changes
  148. punpcklbw xmm2, xmm0 ;
  149. add ebx, edx ; next row of reference pixels
  150. paddsw xmm2, xmm4 ; add in first 4 changes
  151. lea ecx, [ecx + 16] ; next row of changes
  152. packuswb xmm2, xmm0 ; pack result to unsigned 8-bit values
  153. cmp ecx, edi ; are we done?
  154. movq QWORD PTR [eax], xmm2 ; store result
  155. lea eax, [eax+edx] ; next row of output
  156. jc L ; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
  157. pop edi
  158. }
  159. }
  160. /****************************************************************************
  161. *
  162. * ROUTINE : WmtReconInterHalfPixel2
  163. *
  164. * INPUTS : UINT8 * RefPtr1, RefPtr2
  165. * The last frame reference
  166. *
  167. * INT16 * ChangePtr
  168. * Pointer to the change data
  169. *
  170. * UINT32 LineStep
  171. * Line Length in pixels in recon and ref images
  172. *
  173. *
  174. * OUTPUTS : UINT8 * ReconPtr
  175. * The reconstruction
  176. *
  177. * RETURNS : None
  178. *
  179. * FUNCTION : Reconstructs data from half pixel reference data and change.
  180. * Half pixel data interpolated from 2 references.
  181. *
  182. * SPECIAL NOTES :
  183. *
  184. *
  185. * ERRORS : None.
  186. *
  187. ****************************************************************************/
  188. void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
  189. UINT8 * RefPtr1, UINT8 * RefPtr2,
  190. INT16 * ChangePtr, UINT32 LineStep )
  191. {
  192. (void)TmpDataBuffer;
  193. _asm {
  194. push esi
  195. push edi
  196. mov ecx, [ChangePtr]
  197. mov esi, [RefPtr1]
  198. mov edi, [RefPtr2]
  199. mov ebx, [ReconPtr]
  200. mov edx, [LineStep]
  201. lea eax, [ecx+128]
  202. pxor xmm0, xmm0
  203. L:
  204. movq xmm2, QWORD PTR [esi] ; (+3 misaligned) mm2 = row from ref1
  205. movq xmm4, QWORD PTR [edi] ; (+3 misaligned) mm4 = row from ref2
  206. punpcklbw xmm2, xmm0 ;
  207. punpcklbw xmm4, xmm0 ;
  208. movdqa xmm6, [ecx] ; mm6 = first 4 changes
  209. paddw xmm2, xmm4 ; mm2 = start (ref1 + ref2)
  210. psrlw xmm2, 1 ; mm2 = start (ref1 + ref2)/2
  211. paddw xmm2, xmm6 ; add changes to start
  212. lea ecx, [ecx+16] ; next row idct
  213. packuswb xmm2, xmm0 ; pack start|end to unsigned 8-bit
  214. add esi, edx ; next row ref1
  215. add edi, edx ; next row ref2
  216. cmp ecx, eax
  217. movq QWORD PTR [ebx], xmm2 ; store result
  218. ;
  219. lea ebx, [ebx+edx]
  220. jc L
  221. pop edi
  222. pop esi
  223. }
  224. }