XDSP.h 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754
  1. /*-========================================================================-_
  2. | - XDSP - |
  3. | Copyright (c) Microsoft Corporation. All rights reserved. |
  4. |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
  5. |PROJECT: XDSP MODEL: Unmanaged User-mode |
  6. |VERSION: 1.2 EXCEPT: No Exceptions |
  7. |CLASS: N / A MINREQ: WinXP, Xbox360 |
  8. |BASE: N / A DIALECT: MSC++ 14.00 |
  9. |>------------------------------------------------------------------------<|
  10. | DUTY: DSP functions with CPU extension specific optimizations |
  11. ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
  12. NOTES:
  13. 1. Definition of terms:
  14. DSP: Digital Signal Processing.
  15. FFT: Fast Fourier Transform.
  16. Frame: A block of samples, one per channel,
  17. to be played simultaneously.
  18. 2. All buffer parameters must be 16-byte aligned.
  19. 3. All FFT functions support only FLOAT32 audio. */
  20. #pragma once
  21. //--------------<D-E-F-I-N-I-T-I-O-N-S>-------------------------------------//
  22. #include <windef.h> // general windows types
  23. #include <math.h> // trigonometric functions
  24. #if defined(_XBOX) // SIMD intrinsics
  25. #include <ppcintrinsics.h>
  26. #else
  27. #include <emmintrin.h>
  28. #endif
  29. //--------------<M-A-C-R-O-S>-----------------------------------------------//
  30. // assertion
  31. #if !defined(DSPASSERT)
  32. #if DBG
  33. #define DSPASSERT(exp) if (!(exp)) { OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"); __debugbreak(); }
  34. #else
  35. #define DSPASSERT(exp) __assume(exp)
  36. #endif
  37. #endif
  38. // true if n is a power of 2
  39. #if !defined(ISPOWEROF2)
  40. #define ISPOWEROF2(n) ( ((n)&((n)-1)) == 0 && (n) != 0 )
  41. #endif
  42. //--------------<H-E-L-P-E-R-S>---------------------------------------------//
  43. namespace XDSP {
  44. #pragma warning(push)
  45. #pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings
  46. // Helper functions, used by the FFT functions.
  47. // The application need not call them directly.
  48. // primitive types
  49. typedef __m128 XVECTOR;
  50. typedef XVECTOR& XVECTORREF;
  51. typedef const XVECTOR& XVECTORREFC;
  52. // Parallel multiplication of four complex numbers, assuming
  53. // real and imaginary values are stored in separate vectors.
  54. __forceinline void vmulComplex (__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
  55. {
  56. // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
  57. XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
  58. XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
  59. XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
  60. XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
  61. rResult = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
  62. iResult = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
  63. }
  64. __forceinline void vmulComplex (__inout XVECTORREF r1, __inout XVECTORREF i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
  65. {
  66. // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
  67. XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
  68. XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
  69. XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
  70. XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
  71. r1 = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
  72. i1 = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
  73. }
  74. // Radix-4 decimation-in-time FFT butterfly.
  75. // This version assumes that all four elements of the butterfly are
  76. // adjacent in a single vector.
  77. //
  78. // Compute the product of the complex input vector and the
  79. // 4-element DFT matrix:
  80. // | 1 1 1 1 | | (r1X,i1X) |
  81. // | 1 -j -1 j | | (r1Y,i1Y) |
  82. // | 1 -1 1 -1 | | (r1Z,i1Z) |
  83. // | 1 j -1 -j | | (r1W,i1W) |
  84. //
  85. // This matrix can be decomposed into two simpler ones to reduce the
  86. // number of additions needed. The decomposed matrices look like this:
  87. // | 1 0 1 0 | | 1 0 1 0 |
  88. // | 0 1 0 -j | | 1 0 -1 0 |
  89. // | 1 0 -1 0 | | 0 1 0 1 |
  90. // | 0 1 0 j | | 0 1 0 -1 |
  91. //
  92. // Combine as follows:
  93. // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) |
  94. // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) |
  95. // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) |
  96. // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) |
  97. //
  98. // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) |
  99. // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) |
  100. // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) |
  101. // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) |
  102. __forceinline void ButterflyDIT4_1 (__inout XVECTORREF r1, __inout XVECTORREF i1)
  103. {
  104. // sign constants for radix-4 butterflies
  105. const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f, 0.0f, -0.0f };
  106. const static XVECTOR vDFT4SignBits2 = { 0.0f, 0.0f, -0.0f, -0.0f };
  107. const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f, 0.0f };
  108. // calculating Temp
  109. XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1, 1, 0, 0)), // [r1X| r1X|r1Y| r1Y] +
  110. _mm_xor_ps(_mm_shuffle_ps(r1, r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [r1Z|-r1Z|r1W|-r1W]
  111. XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(i1, i1, _MM_SHUFFLE(1, 1, 0, 0)), // [i1X| i1X|i1Y| i1Y] +
  112. _mm_xor_ps(_mm_shuffle_ps(i1, i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [i1Z|-i1Z|i1W|-i1W]
  113. // calculating Result
  114. XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2)); // [rTempZ|rTempW|iTempZ|iTempW]
  115. XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0)); // [rTempZ|iTempW|rTempZ|iTempW]
  116. XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2)); // [rTempZ|iTempW|rTempZ|iTempW]
  117. r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [rTempX| rTempY| rTempX| rTempY] +
  118. _mm_xor_ps(rZiWrZiW, vDFT4SignBits2) ); // [rTempZ| iTempW|-rTempZ|-iTempW]
  119. i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [iTempX| iTempY| iTempX| iTempY] +
  120. _mm_xor_ps(iZrWiZrW, vDFT4SignBits3) ); // [iTempZ|-rTempW|-iTempZ| rTempW]
  121. }
  122. // Radix-4 decimation-in-time FFT butterfly.
  123. // This version assumes that elements of the butterfly are
  124. // in different vectors, so that each vector in the input
  125. // contains elements from four different butterflies.
  126. // The four separate butterflies are processed in parallel.
  127. //
  128. // The calculations here are the same as the ones in the single-vector
  129. // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W)
  130. // they are done in parallel on sixteen independent complex values.
  131. // There is no interdependence between the vector elements:
  132. // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) |
  133. // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) |
  134. // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) |
  135. // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) |
  136. //
  137. // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) |
  138. // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) |
  139. // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) |
  140. // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) |
  141. __forceinline void ButterflyDIT4_4 (__inout XVECTORREF r0,
  142. __inout XVECTORREF r1,
  143. __inout XVECTORREF r2,
  144. __inout XVECTORREF r3,
  145. __inout XVECTORREF i0,
  146. __inout XVECTORREF i1,
  147. __inout XVECTORREF i2,
  148. __inout XVECTORREF i3,
  149. __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableReal,
  150. __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableImaginary,
  151. const UINT32 uStride, const BOOL fLast)
  152. {
  153. DSPASSERT(pUnityTableReal != NULL);
  154. DSPASSERT(pUnityTableImaginary != NULL);
  155. DSPASSERT((UINT_PTR)pUnityTableReal % 16 == 0);
  156. DSPASSERT((UINT_PTR)pUnityTableImaginary % 16 == 0);
  157. DSPASSERT(ISPOWEROF2(uStride));
  158. XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7;
  159. XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7;
  160. // calculating Temp
  161. rTemp0 = _mm_add_ps(r0, r2); iTemp0 = _mm_add_ps(i0, i2);
  162. rTemp2 = _mm_add_ps(r1, r3); iTemp2 = _mm_add_ps(i1, i3);
  163. rTemp1 = _mm_sub_ps(r0, r2); iTemp1 = _mm_sub_ps(i0, i2);
  164. rTemp3 = _mm_sub_ps(r1, r3); iTemp3 = _mm_sub_ps(i1, i3);
  165. rTemp4 = _mm_add_ps(rTemp0, rTemp2); iTemp4 = _mm_add_ps(iTemp0, iTemp2);
  166. rTemp5 = _mm_add_ps(rTemp1, iTemp3); iTemp5 = _mm_sub_ps(iTemp1, rTemp3);
  167. rTemp6 = _mm_sub_ps(rTemp0, rTemp2); iTemp6 = _mm_sub_ps(iTemp0, iTemp2);
  168. rTemp7 = _mm_sub_ps(rTemp1, iTemp3); iTemp7 = _mm_add_ps(iTemp1, rTemp3);
  169. // calculating Result
  170. // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial
  171. vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]);
  172. vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]);
  173. vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]);
  174. if (fLast) {
  175. ButterflyDIT4_1(rTemp4, iTemp4);
  176. ButterflyDIT4_1(rTemp5, iTemp5);
  177. ButterflyDIT4_1(rTemp6, iTemp6);
  178. ButterflyDIT4_1(rTemp7, iTemp7);
  179. }
  180. r0 = rTemp4; i0 = iTemp4;
  181. r1 = rTemp5; i1 = iTemp5;
  182. r2 = rTemp6; i2 = iTemp6;
  183. r3 = rTemp7; i3 = iTemp7;
  184. }
  185. //--------------<F-U-N-C-T-I-O-N-S>-----------------------------------------//
  186. ////
  187. // DESCRIPTION:
  188. // 4-sample FFT.
  189. //
  190. // PARAMETERS:
  191. // pReal - [inout] real components, must have at least uCount elements
  192. // pImaginary - [inout] imaginary components, must have at least uCount elements
  193. // uCount - [in] number of FFT iterations
  194. //
  195. // RETURN VALUE:
  196. // void
  197. ////
  198. __forceinline void FFT4 (__inout_ecount(uCount) XVECTOR* __restrict pReal, __inout_ecount(uCount) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  199. {
  200. DSPASSERT(pReal != NULL);
  201. DSPASSERT(pImaginary != NULL);
  202. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  203. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  204. DSPASSERT(ISPOWEROF2(uCount));
  205. for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  206. ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
  207. }
  208. }
  209. ////
  210. // DESCRIPTION:
  211. // 8-sample FFT.
  212. //
  213. // PARAMETERS:
  214. // pReal - [inout] real components, must have at least uCount*2 elements
  215. // pImaginary - [inout] imaginary components, must have at least uCount*2 elements
  216. // uCount - [in] number of FFT iterations
  217. //
  218. // RETURN VALUE:
  219. // void
  220. ////
  221. __forceinline void FFT8 (__inout_ecount(uCount*2) XVECTOR* __restrict pReal, __inout_ecount(uCount*2) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  222. {
  223. DSPASSERT(pReal != NULL);
  224. DSPASSERT(pImaginary != NULL);
  225. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  226. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  227. DSPASSERT(ISPOWEROF2(uCount));
  228. static XVECTOR wr1 = { 1.0f, 0.70710677f, 0.0f, -0.70710677f };
  229. static XVECTOR wi1 = { 0.0f, -0.70710677f, -1.0f, -0.70710677f };
  230. static XVECTOR wr2 = { -1.0f, -0.70710677f, 0.0f, 0.70710677f };
  231. static XVECTOR wi2 = { 0.0f, 0.70710677f, 1.0f, 0.70710677f };
  232. for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  233. XVECTOR* __restrict pR = pReal + uIndex*2;
  234. XVECTOR* __restrict pI = pImaginary + uIndex*2;
  235. XVECTOR oddsR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1));
  236. XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0));
  237. XVECTOR oddsI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1));
  238. XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0));
  239. ButterflyDIT4_1(oddsR, oddsI);
  240. ButterflyDIT4_1(evensR, evensI);
  241. XVECTOR r, i;
  242. vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
  243. pR[0] = _mm_add_ps(evensR, r);
  244. pI[0] = _mm_add_ps(evensI, i);
  245. vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
  246. pR[1] = _mm_add_ps(evensR, r);
  247. pI[1] = _mm_add_ps(evensI, i);
  248. }
  249. }
  250. ////
  251. // DESCRIPTION:
  252. // 16-sample FFT.
  253. //
  254. // PARAMETERS:
  255. // pReal - [inout] real components, must have at least uCount*4 elements
  256. // pImaginary - [inout] imaginary components, must have at least uCount*4 elements
  257. // uCount - [in] number of FFT iterations
  258. //
  259. // RETURN VALUE:
  260. // void
  261. ////
  262. __forceinline void FFT16 (__inout_ecount(uCount*4) XVECTOR* __restrict pReal, __inout_ecount(uCount*4) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  263. {
  264. DSPASSERT(pReal != NULL);
  265. DSPASSERT(pImaginary != NULL);
  266. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  267. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  268. DSPASSERT(ISPOWEROF2(uCount));
  269. XVECTOR aUnityTableReal[4] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f };
  270. XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f };
  271. for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  272. ButterflyDIT4_4(pReal[uIndex*4],
  273. pReal[uIndex*4 + 1],
  274. pReal[uIndex*4 + 2],
  275. pReal[uIndex*4 + 3],
  276. pImaginary[uIndex*4],
  277. pImaginary[uIndex*4 + 1],
  278. pImaginary[uIndex*4 + 2],
  279. pImaginary[uIndex*4 + 3],
  280. aUnityTableReal,
  281. aUnityTableImaginary,
  282. 1, TRUE);
  283. }
  284. }
  285. ////
  286. // DESCRIPTION:
  287. // 2^N-sample FFT.
  288. //
  289. // REMARKS:
  290. // For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4().
  291. //
  292. // PARAMETERS:
  293. // pReal - [inout] real components, must have at least (uLength*uCount)/4 elements
  294. // pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
  295. // pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
  296. // uLength - [in] FFT length in samples, must be a power of 2 > 16
  297. // uCount - [in] number of FFT iterations
  298. //
  299. // RETURN VALUE:
  300. // void
  301. ////
  302. inline void FFT (__inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pReal, __inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(uLength*uCount) const XVECTOR* __restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1)
  303. {
  304. DSPASSERT(pReal != NULL);
  305. DSPASSERT(pImaginary != NULL);
  306. DSPASSERT(pUnityTable != NULL);
  307. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  308. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  309. DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
  310. DSPASSERT(uLength > 16);
  311. DSPASSERT(ISPOWEROF2(uLength));
  312. DSPASSERT(ISPOWEROF2(uCount));
  313. const XVECTOR* __restrict pUnityTableReal = pUnityTable;
  314. const XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
  315. const UINT32 uTotal = uCount * uLength;
  316. const UINT32 uTotal_vectors = uTotal >> 2;
  317. const UINT32 uStage_vectors = uLength >> 2;
  318. const UINT32 uStage_vectors_mask = uStage_vectors - 1;
  319. const UINT32 uStride = uLength >> 4; // stride between butterfly elements
  320. const UINT32 uStrideMask = uStride - 1;
  321. const UINT32 uStride2 = uStride * 2;
  322. const UINT32 uStride3 = uStride * 3;
  323. const UINT32 uStrideInvMask = ~uStrideMask;
  324. for (UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) {
  325. const UINT32 n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask);
  326. ButterflyDIT4_4(pReal[n],
  327. pReal[n + uStride],
  328. pReal[n + uStride2],
  329. pReal[n + uStride3],
  330. pImaginary[n ],
  331. pImaginary[n + uStride],
  332. pImaginary[n + uStride2],
  333. pImaginary[n + uStride3],
  334. pUnityTableReal + (n & uStage_vectors_mask),
  335. pUnityTableImaginary + (n & uStage_vectors_mask),
  336. uStride, FALSE);
  337. }
  338. if (uLength > 16*4) {
  339. FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
  340. } else if (uLength == 16*4) {
  341. FFT16(pReal, pImaginary, uCount*4);
  342. } else if (uLength == 8*4) {
  343. FFT8(pReal, pImaginary, uCount*4);
  344. } else if (uLength == 4*4) {
  345. FFT4(pReal, pImaginary, uCount*4);
  346. }
  347. }
  348. //--------------------------------------------------------------------------//
  349. ////
  350. // DESCRIPTION:
  351. // Initializes unity roots lookup table used by FFT functions.
  352. // Once initialized, the table need not be initialized again unless a
  353. // different FFT length is desired.
  354. //
  355. // REMARKS:
  356. // The unity tables of FFT length 16 and below are hard coded into the
  357. // respective FFT functions and so need not be initialized.
  358. //
  359. // PARAMETERS:
  360. // pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements
  361. // uLength - [in] FFT length in frames, must be a power of 2 > 16
  362. //
  363. // RETURN VALUE:
  364. // void
  365. ////
  366. inline void FFTInitializeUnityTable (__out_ecount(uLength) XVECTOR* __restrict pUnityTable, UINT32 uLength)
  367. {
  368. DSPASSERT(pUnityTable != NULL);
  369. DSPASSERT(uLength > 16);
  370. DSPASSERT(ISPOWEROF2(uLength));
  371. FLOAT32* __restrict pfUnityTable = (FLOAT32* __restrict)pUnityTable;
  372. // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16
  373. do {
  374. FLOAT32 flStep = 6.283185307f / uLength; // 2PI / FFT length
  375. uLength >>= 2;
  376. // pUnityTable[0 to uLength*4-1] contains real components for current FFT length
  377. // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length
  378. for (UINT32 i=0; i<4; ++i) {
  379. for (UINT32 j=0; j<uLength; ++j) {
  380. UINT32 uIndex = (i*uLength) + j;
  381. pfUnityTable[uIndex] = cosf(FLOAT32(i)*FLOAT32(j)*flStep); // real component
  382. pfUnityTable[uIndex + uLength*4] = -sinf(FLOAT32(i)*FLOAT32(j)*flStep); // imaginary component
  383. }
  384. }
  385. pfUnityTable += uLength*8;
  386. } while (uLength > 16);
  387. }
  388. ////
  389. // DESCRIPTION:
  390. // The FFT functions generate output in bit reversed order.
  391. // Use this function to re-arrange them into order of increasing frequency.
  392. //
  393. // REMARKS:
  394. //
  395. // PARAMETERS:
  396. // pOutput - [out] output buffer, receives samples in order of increasing frequency, cannot overlap pInput, must have at least (1<<uLog2Length)/4 elements
  397. // pInput - [in] input buffer, samples in bit reversed order as generated by FFT functions, cannot overlap pOutput, must have at least (1<<uLog2Length)/4 elements
  398. // uLog2Length - [in] LOG (base 2) of FFT length in samples, must be >= 2
  399. //
  400. // RETURN VALUE:
  401. // void
  402. ////
  403. inline void FFTUnswizzle (__out_ecount((1<<uLog2Length)/4) XVECTOR* __restrict pOutput, __in_ecount((1<<uLog2Length)/4) const XVECTOR* __restrict pInput, const UINT32 uLog2Length)
  404. {
  405. DSPASSERT(pOutput != NULL);
  406. DSPASSERT(pInput != NULL);
  407. DSPASSERT(uLog2Length >= 2);
  408. FLOAT32* __restrict pfOutput = (FLOAT32* __restrict)pOutput;
  409. const FLOAT32* __restrict pfInput = (const FLOAT32* __restrict)pInput;
  410. const UINT32 uLength = UINT32(1 << uLog2Length);
  411. if ((uLog2Length & 0x1) == 0) {
  412. // even powers of two
  413. for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
  414. UINT32 n = uIndex;
  415. n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 );
  416. n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 );
  417. n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 );
  418. n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
  419. n >>= (32 - uLog2Length);
  420. pfOutput[n] = pfInput[uIndex];
  421. }
  422. } else {
  423. // odd powers of two
  424. for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
  425. UINT32 n = (uIndex>>3);
  426. n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 );
  427. n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 );
  428. n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 );
  429. n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
  430. n >>= (32 - (uLog2Length-3));
  431. n |= ((uIndex & 0x7) << (uLog2Length - 3));
  432. pfOutput[n] = pfInput[uIndex];
  433. }
  434. }
  435. }
  436. ////
  437. // DESCRIPTION:
  438. // Convert complex components to polar form.
  439. //
  440. // PARAMETERS:
  441. // pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
  442. // pInputReal - [in] input buffer (real components), must have at least uLength/4 elements
  443. // pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements
  444. // uLength - [in] FFT length in samples, must be a power of 2 >= 4
  445. //
  446. // RETURN VALUE:
  447. // void
  448. ////
  449. inline void FFTPolar (__out_ecount(uLength/4) XVECTOR* __restrict pOutput, __in_ecount(uLength/4) const XVECTOR* __restrict pInputReal, __in_ecount(uLength/4) const XVECTOR* __restrict pInputImaginary, const UINT32 uLength)
  450. {
  451. DSPASSERT(pOutput != NULL);
  452. DSPASSERT(pInputReal != NULL);
  453. DSPASSERT(pInputImaginary != NULL);
  454. DSPASSERT(uLength >= 4);
  455. DSPASSERT(ISPOWEROF2(uLength));
  456. FLOAT32 flOneOverLength = 1.0f / uLength;
  457. // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
  458. XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength);
  459. for (UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) {
  460. XVECTOR vReal = _mm_mul_ps(pInputReal[uIndex], vOneOverLength);
  461. XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength);
  462. XVECTOR vRR = _mm_mul_ps(vReal, vReal);
  463. XVECTOR vII = _mm_mul_ps(vImaginary, vImaginary);
  464. XVECTOR vRRplusII = _mm_add_ps(vRR, vII);
  465. XVECTOR vTotal = _mm_sqrt_ps(vRRplusII);
  466. pOutput[uIndex] = _mm_add_ps(vTotal, vTotal);
  467. }
  468. }
  469. //--------------------------------------------------------------------------//
  470. ////
  471. // DESCRIPTION:
  472. // Deinterleaves audio samples such that all samples corresponding to
  473. //
  474. // REMARKS:
  475. // For example, audio of the form [LRLRLR] becomes [LLLRRR].
  476. //
  477. // PARAMETERS:
  478. // pOutput - [out] output buffer, receives samples in deinterleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
  479. // pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
  480. // uChannelCount - [in] number of channels, must be > 1
  481. // uFrameCount - [in] number of frames of valid data, must be > 0
  482. //
  483. // RETURN VALUE:
  484. // void
  485. ////
  486. inline void Deinterleave (__out_ecount((uChannelCount*uFrameCount)/4) XVECTOR* __restrict pOutput, __in_ecount((uChannelCount*uFrameCount)/4) const XVECTOR* __restrict pInput, const UINT32 uChannelCount, const UINT32 uFrameCount)
  487. {
  488. DSPASSERT(pOutput != NULL);
  489. DSPASSERT(pInput != NULL);
  490. DSPASSERT(uChannelCount > 1);
  491. DSPASSERT(uFrameCount > 0);
  492. FLOAT32* __restrict pfOutput = (FLOAT32* __restrict)pOutput;
  493. const FLOAT32* __restrict pfInput = (const FLOAT32* __restrict)pInput;
  494. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  495. for (UINT32 uFrame=0; uFrame<uFrameCount; ++uFrame) {
  496. pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel];
  497. }
  498. }
  499. }
  500. ////
  501. // DESCRIPTION:
  502. // Interleaves audio samples such that all samples corresponding to
  503. //
  504. // REMARKS:
  505. // For example, audio of the form [LLLRRR] becomes [LRLRLR].
  506. //
  507. // PARAMETERS:
  508. // pOutput - [out] output buffer, receives samples in interleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
  509. // pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
  510. // uChannelCount - [in] number of channels, must be > 1
  511. // uFrameCount - [in] number of frames of valid data, must be > 0
  512. //
  513. // RETURN VALUE:
  514. // void
  515. ////
  516. inline void Interleave (__out_ecount((uChannelCount*uFrameCount)/4) XVECTOR* __restrict pOutput, __in_ecount((uChannelCount*uFrameCount)/4) const XVECTOR* __restrict pInput, const UINT32 uChannelCount, const UINT32 uFrameCount)
  517. {
  518. DSPASSERT(pOutput != NULL);
  519. DSPASSERT(pInput != NULL);
  520. DSPASSERT(uChannelCount > 1);
  521. DSPASSERT(uFrameCount > 0);
  522. FLOAT32* __restrict pfOutput = (FLOAT32* __restrict)pOutput;
  523. const FLOAT32* __restrict pfInput = (const FLOAT32* __restrict)pInput;
  524. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  525. for (UINT32 uFrame=0; uFrame<uFrameCount; ++uFrame) {
  526. pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame];
  527. }
  528. }
  529. }
  530. //--------------------------------------------------------------------------//
  531. ////
  532. // DESCRIPTION:
  533. // This function applies a 2^N-sample FFT and unswizzles the result such
  534. // that the samples are in order of increasing frequency.
  535. // Audio is first deinterleaved if multichannel.
  536. //
  537. // PARAMETERS:
  538. // pReal - [inout] real components, must have at least (1<<uLog2Length*uChannelCount)/4 elements
  539. // pImaginary - [out] imaginary components, must have at least (1<<uLog2Length*uChannelCount)/4 elements
  540. // pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
  541. // uChannelCount - [in] number of channels, must be within [1, 6]
  542. // uLog2Length - [in] LOG (base 2) of FFT length in frames, must within [2, 9]
  543. //
  544. // RETURN VALUE:
  545. // void
  546. ////
  547. inline void FFTInterleaved (__inout_ecount((1<<uLog2Length*uChannelCount)/4) XVECTOR* __restrict pReal, __out_ecount((1<<uLog2Length*uChannelCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(1<<uLog2Length) const XVECTOR* __restrict pUnityTable, const UINT32 uChannelCount, const UINT32 uLog2Length)
  548. {
  549. DSPASSERT(pReal != NULL);
  550. DSPASSERT(pImaginary != NULL);
  551. DSPASSERT(pUnityTable != NULL);
  552. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  553. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  554. DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
  555. DSPASSERT(uChannelCount > 0 && uChannelCount <= 6);
  556. DSPASSERT(uLog2Length >= 2 && uLog2Length <= 9);
  557. XVECTOR vRealTemp[768];
  558. XVECTOR vImaginaryTemp[768];
  559. const UINT32 uLength = UINT32(1 << uLog2Length);
  560. if (uChannelCount > 1) {
  561. Deinterleave(vRealTemp, pReal, uChannelCount, uLength);
  562. } else {
  563. CopyMemory(vRealTemp, pReal, (uLength>>2)*sizeof(XVECTOR));
  564. }
  565. for (UINT32 u=0; u<uChannelCount*(uLength>>2); u++) {
  566. vImaginaryTemp[u] = _mm_setzero_ps();
  567. }
  568. if (uLength > 16) {
  569. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  570. FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
  571. }
  572. } else if (uLength == 16) {
  573. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  574. FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
  575. }
  576. } else if (uLength == 8) {
  577. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  578. FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
  579. }
  580. } else if (uLength == 4) {
  581. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  582. FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
  583. }
  584. }
  585. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  586. FFTUnswizzle(&pReal[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
  587. FFTUnswizzle(&pImaginary[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], uLog2Length);
  588. }
  589. }
  590. ////
  591. // DESCRIPTION:
  592. // This function applies a 2^N-sample inverse FFT.
  593. // Audio is interleaved if multichannel.
  594. //
  595. // PARAMETERS:
  596. // pReal - [inout] real components, must have at least (1<<uLog2Length*uChannelCount)/4 elements
  597. // pImaginary - [out] imaginary components, must have at least (1<<uLog2Length*uChannelCount)/4 elements
  598. // pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
  599. // uChannelCount - [in] number of channels, must be > 0
  600. // uLog2Length - [in] LOG (base 2) of FFT length in frames, must within [2, 10]
  601. //
  602. // RETURN VALUE:
  603. // void
  604. ////
  605. inline void IFFTDeinterleaved (__inout_ecount((1<<uLog2Length*uChannelCount)/4) XVECTOR* __restrict pReal, __out_ecount((1<<uLog2Length*uChannelCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(1<<uLog2Length) const XVECTOR* __restrict pUnityTable, const UINT32 uChannelCount, const UINT32 uLog2Length)
  606. {
  607. DSPASSERT(pReal != NULL);
  608. DSPASSERT(pImaginary != NULL);
  609. DSPASSERT(pUnityTable != NULL);
  610. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  611. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  612. DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
  613. DSPASSERT(uChannelCount > 0 && uChannelCount <= 6);
  614. DSPASSERT(uLog2Length >= 2 && uLog2Length <= 9);
  615. XVECTOR vRealTemp[768];
  616. XVECTOR vImaginaryTemp[768];
  617. const UINT32 uLength = UINT32(1 << uLog2Length);
  618. const XVECTOR vRnp = _mm_set_ps1(1.0f/uLength);
  619. const XVECTOR vRnm = _mm_set_ps1(-1.0f/uLength);
  620. for (UINT32 u=0; u<uChannelCount*(uLength>>2); u++) {
  621. vRealTemp[u] = _mm_mul_ps(pReal[u], vRnp);
  622. vImaginaryTemp[u] = _mm_mul_ps(pImaginary[u], vRnm);
  623. }
  624. if (uLength > 16) {
  625. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  626. FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
  627. }
  628. } else if (uLength == 16) {
  629. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  630. FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
  631. }
  632. } else if (uLength == 8) {
  633. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  634. FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
  635. }
  636. } else if (uLength == 4) {
  637. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  638. FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
  639. }
  640. }
  641. for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
  642. FFTUnswizzle(&vImaginaryTemp[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
  643. }
  644. if (uChannelCount > 1) {
  645. Interleave(pReal, vImaginaryTemp, uChannelCount, uLength);
  646. } else {
  647. CopyMemory(pReal, vImaginaryTemp, (uLength>>2)*sizeof(XVECTOR));
  648. }
  649. }
  650. #pragma warning(pop)
  651. }; // namespace XDSP
  652. //---------------------------------<-EOF->----------------------------------//