asm_math.h 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. #ifndef __asm_math_H__
  2. #define __asm_math_H__
  3. #include "OgrePrerequisites.h"
  4. #if OGRE_COMPILER == OGRE_COMPILER_MSVC
  5. # pragma warning (push)
  6. // disable "instruction may be inaccurate on some Pentiums"
  7. # pragma warning (disable : 4725)
  8. #endif
  9. namespace Ogre
  10. {
  11. /*=============================================================================
  12. ASM math routines posted by davepermen et al on flipcode forums
  13. =============================================================================*/
  // Single-precision pi and pi / 2.
  // Written as literals so that both constants are plain compile-time values:
  // nothing has to call atan() during static initialisation and the header
  // does not depend on a math header having been included first. The values
  // are the same floats that 4.0f * atan( 1.0f ) and 0.5f * pi produce.
  const float pi = 3.14159265358979323846f;
  const float half_pi = 1.57079632679489661923f;
  16. /*=============================================================================
  17. NO EXPLICIT RETURN REQUIRED FROM THESE METHODS!!
  18. =============================================================================*/
  19. #if OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
  20. # pragma warning( push )
  21. # pragma warning( disable: 4035 )
  22. #endif
  // Returns the arc cosine of r.
  //
  // On 32-bit x86 MSVC builds the value is computed on the x87 FPU as
  //     half_pi + atan( r / -sqrt( 1.f - r * r ) )
  // and is left in st(0); that path intentionally has no return statement.
  // Every other build returns acos( r ).
  //
  // The function is inline because it is defined in a header; without it,
  // including the header from two translation units gives duplicate symbols.
  inline float asm_arccos( float r ) {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      // x87 arithmetic instructions take memory operands, not immediates, so
      // the constants are given storage here.
      const float asm_one = 1.f;
      const float asm_half_pi = half_pi;
      __asm {
          fld r               // r0 = r
          fld r               // r1 = r0, r0 = r
          fmul r              // r0 = r0 * r
          fsubr asm_one       // r0 = 1.f - r0
          fsqrt               // r0 = sqrtf( r0 )
          fchs                // r0 = - r0
          fdiv                // r0 = r1 / r0
          fld1                // {{ r0 = atan( r0 )
          fpatan              // }}
          fadd asm_half_pi    // r0 = r0 + pi / 2
      } // returns r0
  #else
      return float( acos( r ) );
  #endif
  }
  // Returns the arc sine of r.
  //
  // On 32-bit x86 MSVC builds the value is computed on the x87 FPU as
  //     atan( r / sqrt( 1.f - r * r ) )
  // and is left in st(0); that path intentionally has no return statement.
  // Every other build returns asin( r ).
  //
  // The function is inline because it is defined in a header.
  inline float asm_arcsin( float r ) {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      // fsubr needs a memory operand, hence the named constant.
      const float asm_one = 1.f;
      __asm {
          fld r               // r0 = r
          fld r               // r1 = r0, r0 = r
          fmul r              // r0 = r0 * r
          fsubr asm_one       // r0 = 1.f - r0
          fsqrt               // r0 = sqrtf( r0 )
          fdiv                // r0 = r1 / r0
          fld1                // {{ r0 = atan( r0 )
          fpatan              // }}
      } // returns r0
  #else
      return float( asin( r ) );
  #endif
  }
  // Returns the arc tangent of r.
  //
  // On 32-bit x86 MSVC builds the value is produced by fpatan with a
  // denominator of 1 and is left in st(0) (no return statement on that path).
  // Every other build returns atan( r ).
  //
  // The function is inline because it is defined in a header.
  inline float asm_arctan( float r ) {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      __asm {
          fld r               // r0 = r
          fld1                // {{ r0 = atan( r0 )
          fpatan              // }}
      } // returns r0
  #else
      return float( atan( r ) );
  #endif
  }
  // Returns the sine of r.
  //
  // On 32-bit x86 MSVC builds the value comes from the x87 fsin instruction
  // and is left in st(0) (no return statement on that path). Every other
  // build returns sin( r ), converted explicitly to float.
  //
  // The function is inline because it is defined in a header.
  inline float asm_sin( float r ) {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      __asm {
          fld r               // r0 = r
          fsin                // r0 = sinf( r0 )
      } // returns r0
  #else
      return float( sin( r ) );
  #endif
  }
  // Returns the cosine of r.
  //
  // On 32-bit x86 MSVC builds the value comes from the x87 fcos instruction
  // and is left in st(0) (no return statement on that path). Every other
  // build returns cos( r ), converted explicitly to float.
  //
  // The function is inline because it is defined in a header.
  inline float asm_cos( float r ) {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      __asm {
          fld r               // r0 = r
          fcos                // r0 = cosf( r0 )
      } // returns r0
  #else
      return float( cos( r ) );
  #endif
  }
  // Returns the tangent of r.
  //
  // On 32-bit x86 MSVC builds the value is computed on the x87 FPU as
  // sin( r ) / cos( r ) and is left in st(0) (no return statement on that
  // path). Every other build returns tan( r ), converted explicitly to float.
  //
  // The function is inline because it is defined in a header.
  inline float asm_tan( float r ) {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      // return sin( r ) / cos( r );
      __asm {
          fld r               // r0 = r
          fsin                // r0 = sinf( r0 )
          fld r               // r1 = r0, r0 = r
          fcos                // r0 = cosf( r0 )
          fdiv                // r0 = r1 / r0
      } // returns r0
  #else
      return float( tan( r ) );
  #endif
  }
  // Returns a for a * a = r, i.e. the square root of r.
  //
  // On 32-bit x86 MSVC builds the value comes from the x87 fsqrt instruction
  // and is left in st(0) (no return statement on that path). Every other
  // build returns sqrt( r ), converted explicitly to float.
  //
  // The function is inline because it is defined in a header.
  inline float asm_sqrt( float r )
  {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      __asm {
          fld r               // r0 = r
          fsqrt               // r0 = sqrtf( r0 )
      } // returns r0
  #else
      return float( sqrt( r ) );
  #endif
  }
  // Returns 1 / a for a * a = r, i.e. the reciprocal square root of r.
  // -- Use this for Vector normalisation!!!
  //
  // On 32-bit x86 MSVC builds the value is computed on the x87 FPU as
  // 1.f / sqrt( r ) and is left in st(0) (no return statement on that path).
  // Every other build returns 1. / sqrt( r ), converted explicitly to float.
  //
  // The function is inline because it is defined in a header.
  inline float asm_rsq( float r )
  {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      __asm {
          fld1                // r0 = 1.f
          fld r               // r1 = r0, r0 = r
          fsqrt               // r0 = sqrtf( r0 )
          fdiv                // r0 = r1 / r0
      } // returns r0
  #else
      return float( 1. / sqrt( r ) );
  #endif
  }
  // Returns 1 / a for a * a = r, i.e. the reciprocal square root of r.
  // Another version: on 32-bit x86 MSVC builds the result is an approximation.
  //
  // That path builds a first estimate y directly from the bit pattern of r
  // ( 0x5F400000 - ( bits >> 1 ) ), stores it back into r, and refines it once
  // as y * ( 1.5f - 0.5f * r * y * y ). The result is left in st(0), so there
  // is no return statement on that path. Every other build returns the exact
  // 1. / sqrt( r ), converted explicitly to float.
  //
  // The function is inline because it is defined in a header.
  inline float apx_rsq( float r ) {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      // x87 arithmetic instructions take memory operands, so the constants
      // are given storage here.
      const float asm_dot5 = 0.5f;
      const float asm_1dot5 = 1.5f;
      __asm {
          fld r               // r0 = r
          fmul asm_dot5       // r0 = r0 * .5f
          mov eax, r          // eax = r
          shr eax, 0x1        // eax = eax >> 1
          neg eax             // eax = -eax
          add eax, 0x5F400000 // eax = eax + MAGICAL NUMBER
          mov r, eax          // r = eax (r now holds the first estimate)
          fmul r              // r0 = r0 * r
          fmul r              // r0 = r0 * r
          fsubr asm_1dot5     // r0 = 1.5f - r0
          fmul r              // r0 = r0 * r
      } // returns r0
  #else
      return float( 1. / sqrt( r ) );
  #endif
  }
  157. /* Very MS-specific: compiled only for 32-bit MSVC builds (see the #if below).
  158. Finally the best InvSqrt implementation?
  159. Use for vector normalisation instead of 1/length() * x,y,z
  160. */
  // InvSqrt( fValue ): hand-written x87 routine, built only for 32-bit MSVC.
  //
  // The function is __declspec(naked), so the compiler emits no prologue or
  // epilogue: the assembly below is the entire function, including the final
  // "ret 4" that returns and removes the 4-byte argument from the stack.
  // The argument is read from [esp + 4]; [esp - 12] and [esp - 8] (below the
  // stack pointer) are used as scratch storage. The instruction sequence
  // leaves exactly one value on the x87 stack, which is the float result.
  //
  // NOTE(review): presumably this is a bit-pattern first estimate of
  // 1 / sqrt( fValue ) followed by iterative refinement; the accuracy is not
  // stated anywhere in this file - confirm before relying on it.
  161. #if OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
  162. __declspec(naked) float __fastcall InvSqrt(float fValue)
  163. {
  164. __asm
  165. {
  166. mov eax, 0be6eb508h // magic constant used for the first estimate
  167. mov dword ptr[esp-12],03fc00000h // scratch = bit pattern of 1.5f
  168. sub eax, dword ptr[esp + 4] // eax = magic - bits( fValue )
  169. sub dword ptr[esp+4], 800000h // exponent field - 1: halves a normalised fValue in place
  170. shr eax, 1 // eax = ( magic - bits( fValue ) ) >> 1
  171. mov dword ptr[esp - 8], eax // scratch = first estimate, reinterpreted as float below
  172. fld dword ptr[esp - 8]
  173. fmul st, st
  174. fld dword ptr[esp - 8]
  175. fxch st(1)
  176. fmul dword ptr[esp + 4] // multiply by the halved argument
  177. fld dword ptr[esp - 12] // load 1.5f
  178. fld st(0)
  179. fsub st,st(2)
  180. fld st(1)
  181. fxch st(1)
  182. fmul st(3),st
  183. fmul st(3),st
  184. fmulp st(4),st
  185. fsub st,st(2)
  186. fmul st(2),st
  187. fmul st(3),st
  188. fmulp st(2),st
  189. fxch st(1)
  190. fsubp st(1),st
  191. fmulp st(1), st // a single value remains on the FPU stack: the result
  192. ret 4 // return and pop the 4-byte argument
  193. }
  194. }
  195. #endif
  196. // returns a random number
  197. FORCEINLINE float asm_rand()
  198. {
  199. #if OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
  200. #if 0
  201. #if OGRE_COMP_VER >= 1300
  202. static unsigned __int64 q = time( NULL );
  203. _asm {
  204. movq mm0, q
  205. // do the magic MMX thing
  206. pshufw mm1, mm0, 0x1E
  207. paddd mm0, mm1
  208. // move to integer memory location and free MMX
  209. movq q, mm0
  210. emms
  211. }
  212. return float( q );
  213. #endif
  214. #else
  215. // VC6 does not support pshufw
  216. return float( rand() );
  217. #endif
  218. #else
  219. // GCC etc
  220. return float( rand() );
  221. #endif
  222. }
  223. // returns the maximum random number
  224. FORCEINLINE float asm_rand_max()
  225. {
  226. #if OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
  227. #if 0
  228. #if OGRE_COMP_VER >= 1300
  229. return (std::numeric_limits< unsigned __int64 >::max)();
  230. return 9223372036854775807.0f;
  231. #endif
  232. #else
  233. // VC6 does not support unsigned __int64
  234. return float( RAND_MAX );
  235. #endif
  236. #else
  237. // GCC etc
  238. return float( RAND_MAX );
  239. #endif
  240. }
  // Returns log2( r ) / log2( e ), i.e. the natural logarithm of r.
  //
  // On 32-bit x86 MSVC builds the result is an approximation: the exponent
  // field of r supplies the integer part of log2( r ), a quadratic polynomial
  // of the mantissa (rescaled into [1, 2)) supplies the rest, and the sum is
  // multiplied by 1 / log2( e ). The result is left in st(0), so there is no
  // return statement on that path. Every other build returns log( r ),
  // converted explicitly to float.
  //
  // The function is inline because it is defined in a header.
  inline float asm_ln( float r ) {
  #if defined( _MSC_VER ) && defined( _M_IX86 ) && OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
      // x87 arithmetic instructions take memory operands, so the constants
      // are given storage here.
      const float asm_1_div_log2_e = .693147180559f;
      const float asm_neg1_div_3 = -.33333333333333333333333333333f;
      const float asm_neg2_div_3 = -.66666666666666666666666666667f;
      const float asm_2 = 2.f;
      int log_2 = 0;
      __asm {
          // log_2 = ( ( r >> 0x17 ) & 0xFF ) - 0x80;
          mov eax, r
          sar eax, 0x17
          and eax, 0xFF
          sub eax, 0x80
          mov log_2, eax
          // r = ( r & 0x807fffff ) + 0x3f800000;
          mov ebx, r
          and ebx, 0x807FFFFF
          add ebx, 0x3F800000
          mov r, ebx
          // r = ( asm_neg1_div_3 * r + asm_2 ) * r + asm_neg2_div_3; // (1)
          fld r
          fmul asm_neg1_div_3
          fadd asm_2
          fmul r
          fadd asm_neg2_div_3
          // add the integer exponent part, then convert from log2 to ln
          fild log_2
          fadd
          fmul asm_1_div_log2_e
      } // returns r0
  #else
      return float( log( r ) );
  #endif
  }
  275. #if OGRE_COMPILER == OGRE_COMPILER_MSVC && OGRE_ARCH_TYPE == OGRE_ARCHITECTURE_32
  276. # pragma warning( pop )
  277. #endif
  278. } // namespace
  279. #if OGRE_COMPILER == OGRE_COMPILER_MSVC
  280. # pragma warning (pop)
  281. #endif
  282. #endif