// denormal.h
  1. #ifndef _WDL_DENORMAL_H_
  2. #define _WDL_DENORMAL_H_
  // The two 32-bit words of a double, declared in memory order so that `hw` is
  // always the word the rest of this header masks for sign/exponent bits and
  // `lw` is the other word.
  // Word order follows the target byte order: besides the original __ppc__
  // check, any compiler that provides __BYTE_ORDER__ (GCC, clang) selects the
  // big-endian layout automatically. Assumes unsigned int is 32 bits wide.
  typedef struct
  {
  #if defined(__ppc__) || \
      (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    unsigned int hw;
    unsigned int lw;
  #else
    unsigned int lw;
    unsigned int hw;
  #endif
  } WDL_DenormalTwoInts;
  13. typedef union { double fl; WDL_DenormalTwoInts w; } WDL_DenormalDoubleAccess;
  // Overlays a float with a single unsigned int holding its raw bits.
  typedef union
  {
    float fl;       // the value as a floating-point number
    unsigned int w; // the same storage as an integer word
  } WDL_DenormalFloatAccess;
  // note: the _aggressive versions flush anything with a magnitude below roughly 1.0e-16 to 0.0, including -0.0 (which becomes 0.0)
  // note: the _aggressive versions also flush inf and NaN to 0.0
  // WDL_DENORMAL_INLINE: inline specifier placed on every static helper in this header.
  // C++ and C99-or-later use the standard keyword; MSVC and GNU C in older C
  // modes use their own spellings. Only a pre-C99 compiler that is neither
  // gets an empty expansion (plain static functions).
  #ifdef __cplusplus
  #define WDL_DENORMAL_INLINE inline
  #elif defined(_MSC_VER)
  #define WDL_DENORMAL_INLINE __inline
  #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
  #define WDL_DENORMAL_INLINE inline
  #elif defined(__GNUC__)
  #define WDL_DENORMAL_INLINE __inline__
  #else
  #define WDL_DENORMAL_INLINE
  #endif
  // Raw-word accessors. Each macro takes a POINTER to a double (or float) and
  // views the pointed-to storage through the matching access union:
  //   WDL_DENORMAL_DOUBLE_HW / _LW : the hw / lw word of a double (read-only view)
  //   WDL_DENORMAL_FLOAT_W         : the single word of a float (read-only view)
  // The _NC variants cast without const, so the result can be assigned to.
  // NOTE(review): this casts the caller's pointer to a union pointer and
  // dereferences it; confirm the build's strict-aliasing settings tolerate that.
  #define WDL_DENORMAL_DOUBLE_HW(a) (((const WDL_DenormalDoubleAccess*)(a))->w.hw)
  #define WDL_DENORMAL_DOUBLE_LW(a) (((const WDL_DenormalDoubleAccess*)(a))->w.lw)
  #define WDL_DENORMAL_FLOAT_W(a) (((const WDL_DenormalFloatAccess*)(a))->w)
  #define WDL_DENORMAL_DOUBLE_HW_NC(a) (((WDL_DenormalDoubleAccess*)(a))->w.hw)
  #define WDL_DENORMAL_DOUBLE_LW_NC(a) (((WDL_DenormalDoubleAccess*)(a))->w.lw)
  #define WDL_DENORMAL_FLOAT_W_NC(a) (((WDL_DenormalFloatAccess*)(a))->w)
  // Thresholds used by the *_aggressive helpers. They are compared against the
  // exponent bits of the value's high word AFTER one exponent step has been added
  // (0x100000 for double, 0x800000 for float); anything below the cutoff is flushed.
  // The original note suggested "0x3B8000000" as an alternative for the double
  // cutoff; that literal has nine hex digits, so 0x3B800000 was presumably meant.
  #define WDL_DENORMAL_DOUBLE_AGGRESSIVE_CUTOFF 0x3cA00000 // 0x3B800000 maybe instead? that's 10^-5 smaller or so
  #define WDL_DENORMAL_FLOAT_AGGRESSIVE_CUTOFF 0x25000000
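  // Define WDL_DENORMAL_WANTS_SCOPED_FTZ before including this header, then create a WDL_denormal_ftz_scope
  // around code that also calls denormal_*(): where hardware flush-to-zero (FTZ) is available it is used
  // instead, and the denormal_*() helpers become pass-through macros.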
  35. #ifdef WDL_DENORMAL_WANTS_SCOPED_FTZ
  36. #if defined(__SSE2__) || _M_IX86_FP >= 2 || defined(_WIN64)
  // x86 with SSE2 (or any Win64 build): FTZ is controlled through the SSE
  // control/status register, accessed with the _mm_getcsr/_mm_setcsr intrinsics.
  // WDL_DENORMAL_FTZMODE marks hardware FTZ as available; it also switches the
  // software denormal_*() helpers further down to pass-through macros.
  #define WDL_DENORMAL_FTZMODE
  #define WDL_DENORMAL_FTZSTATE_TYPE unsigned int
  #ifdef _MSC_VER
  #include <intrin.h>
  #else
  #include <xmmintrin.h>
  #endif
  #define wdl_denorm_mm_getcsr() _mm_getcsr()
  #define wdl_denorm_mm_setcsr(x) _mm_setcsr(x)
  // wdl_denorm_mm_csr_mask: the bits WDL_denormal_ftz_scope ORs into the register.
  // The denormal-mask and DAZ bits are only added when building for SSE3.
  #if defined(__SSE3__)
  #define wdl_denorm_mm_csr_mask ((1<<15)|(1<<11) | (1<<8) | (1<<6)) // FTZ, underflow, denormal mask, DAZ
  #else
  #define wdl_denorm_mm_csr_mask ((1<<15)|(1<<11)) // FTZ and underflow only (target SSE2)
  #endif
  51. #elif defined(__arm__) || defined(__aarch64__)
  // ARM (32-bit and AArch64): the floating-point control register is read and
  // written with inline assembly, under the same wdl_denorm_mm_* names the x86
  // branch uses, so WDL_denormal_ftz_scope works unchanged.
  #define WDL_DENORMAL_FTZMODE
  #define WDL_DENORMAL_FTZSTATE_TYPE unsigned long
  // Returns the current control register value: FPCR on AArch64, FPSCR otherwise.
  // Marked unused so translation units that never call it do not warn.
  static unsigned long __attribute__((unused)) wdl_denorm_mm_getcsr()
  {
  unsigned long rv;
  #ifdef __aarch64__
  asm volatile ( "mrs %0, fpcr" : "=r" (rv));
  #else
  asm volatile ( "fmrx %0, fpscr" : "=r" (rv));
  #endif
  return rv;
  }
  // Writes v to the same control register (FPCR on AArch64, FPSCR otherwise).
  static void __attribute__((unused)) wdl_denorm_mm_setcsr(unsigned long v)
  {
  #ifdef __aarch64__
  asm volatile ( "msr fpcr, %0" :: "r"(v));
  #else
  asm volatile ( "fmxr fpscr, %0" :: "r"(v));
  #endif
  }
  // Bit that WDL_denormal_ftz_scope sets in the register; bit 24 is presumably
  // the flush-to-zero (FZ) control -- confirm against the Arm architecture manual.
  #define wdl_denorm_mm_csr_mask (1<<24)
  73. #endif
  // Scoped guard for hardware flush-to-zero.
  // Constructing one ORs wdl_denorm_mm_csr_mask into the FP control register
  // (only if those bits are not all set already); destroying it writes the
  // previously read register value back when the constructor changed it.
  // When WDL_DENORMAL_FTZMODE is not defined the class is empty and does nothing.
  class WDL_denormal_ftz_scope
  {
  public:
    WDL_denormal_ftz_scope()
    {
  #ifdef WDL_DENORMAL_FTZMODE
      const WDL_DENORMAL_FTZSTATE_TYPE wanted = wdl_denorm_mm_csr_mask;
      old_state = wdl_denorm_mm_getcsr();
      // Remember whether anything has to be undone later.
      need_restore = (old_state & wanted) != wanted;
      if (need_restore)
      {
        wdl_denorm_mm_setcsr(old_state | wanted);
      }
  #endif
    }

    ~WDL_denormal_ftz_scope()
    {
  #ifdef WDL_DENORMAL_FTZMODE
      if (!need_restore) return;
      wdl_denorm_mm_setcsr(old_state);
  #endif
    }

  #ifdef WDL_DENORMAL_FTZMODE
    WDL_DENORMAL_FTZSTATE_TYPE old_state; // register value read by the constructor
    bool need_restore;                    // true when the constructor modified the register
  #endif
  };
  97. #endif
  98. #if !defined(WDL_DENORMAL_FTZMODE) && !defined(WDL_DENORMAL_DO_NOT_FILTER)
  99. static double WDL_DENORMAL_INLINE denormal_filter_double(double a)
  100. {
  101. return (WDL_DENORMAL_DOUBLE_HW(&a)&0x7ff00000) ? a : 0.0;
  102. }
  103. static double WDL_DENORMAL_INLINE denormal_filter_double2(double a)
  104. {
  105. return ((WDL_DENORMAL_DOUBLE_HW(&a)+0x100000)&0x7ff00000) > 0x100000 ? a : 0.0;
  106. }
  107. static double WDL_DENORMAL_INLINE denormal_filter_double_aggressive(double a)
  108. {
  109. return ((WDL_DENORMAL_DOUBLE_HW(&a)+0x100000)&0x7ff00000) >= WDL_DENORMAL_DOUBLE_AGGRESSIVE_CUTOFF ? a : 0.0;
  110. }
  111. static float WDL_DENORMAL_INLINE denormal_filter_float(float a)
  112. {
  113. return (WDL_DENORMAL_FLOAT_W(&a)&0x7f800000) ? a : 0.0f;
  114. }
  115. static float WDL_DENORMAL_INLINE denormal_filter_float2(float a)
  116. {
  117. return ((WDL_DENORMAL_FLOAT_W(&a)+0x800000)&0x7f800000) > 0x800000 ? a : 0.0f;
  118. }
  119. static float WDL_DENORMAL_INLINE denormal_filter_float_aggressive(float a)
  120. {
  121. return ((WDL_DENORMAL_FLOAT_W(&a)+0x800000)&0x7f800000) >= WDL_DENORMAL_FLOAT_AGGRESSIVE_CUTOFF ? a : 0.0f;
  122. }
  123. static void WDL_DENORMAL_INLINE denormal_fix_double(double *a)
  124. {
  125. if (!(WDL_DENORMAL_DOUBLE_HW(a)&0x7ff00000)) *a=0.0;
  126. }
  127. static void WDL_DENORMAL_INLINE denormal_fix_double_aggressive(double *a)
  128. {
  129. if (((WDL_DENORMAL_DOUBLE_HW(a)+0x100000)&0x7ff00000) < WDL_DENORMAL_DOUBLE_AGGRESSIVE_CUTOFF) *a=0.0;
  130. }
  131. static void WDL_DENORMAL_INLINE denormal_fix_float(float *a)
  132. {
  133. if (!(WDL_DENORMAL_FLOAT_W(a)&0x7f800000)) *a=0.0f;
  134. }
  135. static void WDL_DENORMAL_INLINE denormal_fix_float_aggressive(float *a)
  136. {
  137. if (((WDL_DENORMAL_FLOAT_W(a)+0x800000)&0x7f800000) < WDL_DENORMAL_FLOAT_AGGRESSIVE_CUTOFF) *a=0.0f;
  138. }
  139. #ifdef __cplusplus // automatic typed versions (though one should probably use the explicit versions...
  140. static double WDL_DENORMAL_INLINE denormal_filter(double a)
  141. {
  142. return (WDL_DENORMAL_DOUBLE_HW(&a)&0x7ff00000) ? a : 0.0;
  143. }
  144. static double WDL_DENORMAL_INLINE denormal_filter_aggressive(double a)
  145. {
  146. return ((WDL_DENORMAL_DOUBLE_HW(&a)+0x100000)&0x7ff00000) >= WDL_DENORMAL_DOUBLE_AGGRESSIVE_CUTOFF ? a : 0.0;
  147. }
  148. static float WDL_DENORMAL_INLINE denormal_filter(float a)
  149. {
  150. return (WDL_DENORMAL_FLOAT_W(&a)&0x7f800000) ? a : 0.0f;
  151. }
  152. static float WDL_DENORMAL_INLINE denormal_filter_aggressive(float a)
  153. {
  154. return ((WDL_DENORMAL_FLOAT_W(&a)+0x800000)&0x7f800000) >= WDL_DENORMAL_FLOAT_AGGRESSIVE_CUTOFF ? a : 0.0f;
  155. }
  156. static void WDL_DENORMAL_INLINE denormal_fix(double *a)
  157. {
  158. if (!(WDL_DENORMAL_DOUBLE_HW(a)&0x7ff00000)) *a=0.0;
  159. }
  160. static void WDL_DENORMAL_INLINE denormal_fix_aggressive(double *a)
  161. {
  162. if (((WDL_DENORMAL_DOUBLE_HW(a)+0x100000)&0x7ff00000) < WDL_DENORMAL_DOUBLE_AGGRESSIVE_CUTOFF) *a=0.0;
  163. }
  164. static void WDL_DENORMAL_INLINE denormal_fix(float *a)
  165. {
  166. if (!(WDL_DENORMAL_FLOAT_W(a)&0x7f800000)) *a=0.0f;
  167. }
  168. static void WDL_DENORMAL_INLINE denormal_fix_aggressive(float *a)
  169. {
  170. if (((WDL_DENORMAL_FLOAT_W(a)+0x800000)&0x7f800000) < WDL_DENORMAL_FLOAT_AGGRESSIVE_CUTOFF) *a=0.0f;
  171. }
  172. #endif // cplusplus versions
  173. #else // end of !WDL_DENORMAL_DO_NOT_FILTER (and other platform-specific checks)
  // Pass-through versions, used when WDL_DENORMAL_FTZMODE or
  // WDL_DENORMAL_DO_NOT_FILTER is defined: the filter macros yield their argument
  // unchanged and the fix macros expand to an empty statement (their argument is
  // not evaluated).
  // Note: denormal_filter2 is only defined here; the filtering branch above has
  // no function of that name.
  #define denormal_filter(x) (x)
  #define denormal_filter2(x) (x)
  #define denormal_filter_double(x) (x)
  #define denormal_filter_double2(x) (x)
  #define denormal_filter_double_aggressive(x) (x)
  #define denormal_filter_float(x) (x)
  #define denormal_filter_float2(x) (x)
  #define denormal_filter_float_aggressive(x) (x)
  #define denormal_filter_aggressive(x) (x)
  #define denormal_fix(x) do { } while(0)
  #define denormal_fix_aggressive(x) do { } while(0)
  #define denormal_fix_double(x) do { } while(0)
  #define denormal_fix_double_aggressive(x) do { } while(0)
  #define denormal_fix_float(x) do { } while(0)
  #define denormal_fix_float_aggressive(x) do { } while(0)
  189. #endif
  190. ////////////////////
  // this isn't a denormal function, but it is similar, so we'll put it here as a bonus
  192. static void WDL_DENORMAL_INLINE GetDoubleMaxAbsValue(double *out, const double *in) // note: the value pointed to by "out" must be >=0.0, __NOT__ <= -0.0
  193. {
  194. unsigned int hw = WDL_DENORMAL_DOUBLE_HW(in)&0x7fffffff;
  195. if (hw >= WDL_DENORMAL_DOUBLE_HW(out) && (hw>WDL_DENORMAL_DOUBLE_HW(out) || WDL_DENORMAL_DOUBLE_LW(in) > WDL_DENORMAL_DOUBLE_LW(out)))
  196. {
  197. WDL_DENORMAL_DOUBLE_LW_NC(out) = WDL_DENORMAL_DOUBLE_LW(in);
  198. WDL_DENORMAL_DOUBLE_HW_NC(out) = hw;
  199. }
  200. }
  201. static void WDL_DENORMAL_INLINE GetFloatMaxAbsValue(float *out, const float *in) // note: the value pointed to by "out" must be >=0.0, __NOT__ <= -0.0
  202. {
  203. unsigned int hw = WDL_DENORMAL_FLOAT_W(in)&0x7fffffff;
  204. if (hw > WDL_DENORMAL_FLOAT_W(out)) WDL_DENORMAL_FLOAT_W_NC(out)=hw;
  205. }
  206. #ifdef __cplusplus
  // C++ overload so callers can use the GetFloatMaxAbsValue name with doubles too;
  // it forwards straight to GetDoubleMaxAbsValue().
  static void WDL_DENORMAL_INLINE GetFloatMaxAbsValue(double *out, const double *in) // note: the value pointed to by "out" must be >=0.0, __NOT__ <= -0.0
  {
  GetDoubleMaxAbsValue(out,in);
  }
  211. #endif
  212. #endif