vp.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. /*
  2. ** Command & Conquer Generals Zero Hour(tm)
  3. ** Copyright 2025 Electronic Arts Inc.
  4. **
  5. ** This program is free software: you can redistribute it and/or modify
  6. ** it under the terms of the GNU General Public License as published by
  7. ** the Free Software Foundation, either version 3 of the License, or
  8. ** (at your option) any later version.
  9. **
  10. ** This program is distributed in the hope that it will be useful,
  11. ** but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ** GNU General Public License for more details.
  14. **
  15. ** You should have received a copy of the GNU General Public License
  16. ** along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. /***********************************************************************************************
  19. *** C O N F I D E N T I A L --- W E S T W O O D S T U D I O S ***
  20. ***********************************************************************************************
  21. * *
  22. * Project Name : wwmath *
  23. * *
  24. * $Archive:: /Commando/Code/WWMath/vp.cpp $*
  25. * *
  26. * Org Author:: Hector Yee *
  27. * *
  28. * Author : Kenny Mitchell *
  29. * *
  30. * $Modtime:: 06/26/02 4:04p $*
  31. * *
  32. * $Revision:: 12 $*
  33. * *
  34. * 06/26/02 KM Matrix name change to avoid MAX conflicts *
  35. *---------------------------------------------------------------------------------------------*/
  36. #include "vp.h"
  37. #include "vector2.h"
  38. #include "vector3.h"
  39. #include "vector4.h"
  40. #include "matrix3d.h"
  41. #include "matrix4.h"
  42. #include "wwdebug.h"
  43. #include "cpudetect.h"
  44. #include <memory.h>
  // SHUFFLE packs four 2-bit selectors into one 8-bit immediate for shufps:
  // x lands in bits 7..6, y in bits 5..4, z in bits 3..2 and w in bits 1..0.
  // Each argument is masked with 3, so only its low two bits are used.
  45. #define SHUFFLE(x, y, z, w) (((x)&3)<< 6|((y)&3)<<4|((z)&3)<< 2|((w)&3))
  // BROADCAST emits a shufps of XMM with itself in which all four selectors
  // equal INDEX, i.e. lane INDEX of XMM is replicated into all four lanes.
  46. #define BROADCAST(XMM, INDEX) __asm shufps XMM,XMM,(((INDEX)&3)<< 6|((INDEX)&3)<<4|((INDEX)&3)<< 2|((INDEX)&3))
  // TRANSPOSE treats BX, BY, BZ, BW as the four rows of a 4x4 float matrix and
  // transposes it in place; TV is a scratch register that is overwritten.
  // The unpcklps/unpckhps steps interleave row pairs (X with Y, Z with W) and
  // the shufps steps then gather the low/high halves, so that afterwards
  //   BX = (x0 y0 z0 w0), BY = (x1 y1 z1 w1), BZ = (x2 y2 z2 w2), BW = (x3 y3 z3 w3)
  // where x0..x3 were the original lanes of BX, and likewise for y, z, w.
  47. #define TRANSPOSE(BX, BY, BZ, BW, TV) \
  48. __asm movaps TV,BZ \
  49. __asm unpcklps BZ,BW \
  50. __asm unpckhps TV,BW \
  51. __asm movaps BW,BX \
  52. __asm unpcklps BX,BY \
  53. __asm unpckhps BW,BY \
  54. __asm movaps BY,BX \
  55. __asm shufps BX,BZ,SHUFFLE(1, 0, 1, 0) \
  56. __asm shufps BY,BZ,SHUFFLE(3, 2, 3, 2) \
  57. __asm movaps BZ,BW \
  58. __asm shufps BZ,TV,SHUFFLE(1, 0, 1, 0) \
  59. __asm shufps BW,TV,SHUFFLE(3, 2, 3, 2)
  60. void VectorProcessorClass::Prefetch(void* address)
  61. {
  62. #if defined (__ICL) // Detect Intel compiler
  63. if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
  64. __asm {
  65. // mov edx,address
  66. // mov eax,[edx]
  67. // prefetchT1 address
  68. }
  69. }
  70. #endif
  71. }
  // Row (0,0,0,1) that the SSE path of Transform() loads in place of a fourth
  // matrix row (see the commented-out [ebx+48] on that load).
  72. static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);
  /*
  ** Transform -- transform 'count' Vector3s from 'src' by 'mtx' into 'dst'.
  **
  ** dst    destination array, written 12 bytes per vector
  ** src    source array, read 12 bytes per vector
  ** mtx    matrix; the SSE path reads three 16-byte rows at offsets 0, 16, 32
  ** count  number of vectors; nothing happens when count <= 0
  **
  ** When built with the Intel compiler (__ICL) and the CPU reports SSE, the
  ** inline assembly below is used. It transforms vectors one at a time until
  ** the destination address is 16-byte aligned, then four at a time using
  ** aligned 16-byte stores, then one at a time again for the remainder.
  ** Otherwise the work is delegated to mtx.mulVector3Array(src, dst, count).
  */
  73. void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)
  74. {
  75. if (count<=0) return;
  76. #if defined (__ICL) // Detect Intel compiler
  77. if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
  78. __asm {
  // Register roles: edx = dst cursor, eax = src cursor, ebx = matrix address,
  // edi = vectors still to process.
  79. mov edx,dst
  80. mov eax,src
  81. mov ebx,mtx
  82. mov edi,count
  // xmm4..xmm6 = the three matrix rows, xmm7 = (0,0,0,1).
  83. movups xmm4,[ebx+0]
  84. movups xmm5,[ebx+16]
  85. movups xmm6,[ebx+32]
  86. movups xmm7,lastrow //[ebx+48]
  // After the transpose xmm4..xmm7 hold the matrix columns (xmm0 is scratch).
  87. TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);
  // Re-order each column from lanes (c0 c1 c2 c3) to (c0 c0 c1 c2), so a
  // transformed vector ends up as (X, X, Y, Z): X in lane 0, Y/Z in the high
  // half. That matches the movss + movhps store used by the single-vector path.
  88. shufps xmm4,xmm4,SHUFFLE(2,1,0,0)
  89. shufps xmm5,xmm5,SHUFFLE(2,1,0,0)
  90. shufps xmm6,xmm6,SHUFFLE(2,1,0,0)
  91. shufps xmm7,xmm7,SHUFFLE(2,1,0,0)
  // esi tracks the destination address for the alignment test below.
  92. mov esi,edx
  // ---- Dispatch / single-vector path ----
  93. _lp:
  // Done when no vectors remain.
  94. test edi,edi
  95. jz _ulos
  // Low four bits of esi clear => destination is 16-byte aligned.
  96. test esi,0xf
  97. jz _aligned
  // Transform one vector: broadcast x, y, z, multiply by the three columns,
  // sum, and add xmm7 (built from the fourth word of each row and lastrow).
  98. movss xmm0,[eax]
  99. movss xmm1,[eax+4]
  100. movss xmm2,[eax+8]
  101. BROADCAST(xmm0,0)
  102. BROADCAST(xmm1,0)
  103. BROADCAST(xmm2,0)
  104. mulps xmm0,xmm4
  105. mulps xmm1,xmm5
  106. mulps xmm2,xmm6
  107. addps xmm0,xmm1
  108. addps xmm0,xmm2
  109. addps xmm0,xmm7
  // Store X from lane 0 and Y,Z from lanes 2,3 (12 bytes total).
  110. movss [edx],xmm0
  111. movhps [edx+4],xmm0
  112. add eax,12
  113. add edx,12
  114. add esi,12
  115. dec edi
  116. jmp _lp
  // ---- Aligned bulk path setup ----
  117. _aligned:
  // esi = 1 makes the alignment test at _lp non-zero from here on, so any
  // vectors left after the bulk loop go through the single-vector path.
  118. mov esi,1
  // ecx = count rounded down to a multiple of 4, edi = leftover (count & 3).
  119. mov ecx,edi
  120. and edi,3
  121. and ecx,~3
  122. jz _lp
  // ecx = ecx * 12 (bytes in the bulk region): *3 via lea, then *4 via shl.
  123. lea ecx,[ecx+ecx*2]
  124. shl ecx,2
  // Advance both cursors to the end of the bulk region and index backwards
  // with a negative offset in ecx that counts up towards zero.
  125. add eax,ecx
  126. add edx,ecx
  127. neg ecx
  // Inspect the fourth 32-bit word of each matrix row (offsets 12, 28, 44).
  // If all three are zero the loop without the xmm7 add is used; presumably
  // these are the translation terms ("xlate") -- confirm against Matrix3D.
  128. cmp dword ptr [ebx+12],0
  129. jne _xlatelp
  130. cmp dword ptr [ebx+28],0
  131. jne _xlatelp
  132. cmp dword ptr [ebx+44],0
  133. jne _xlatelp
  134. jmp _noxlatelp
  135. align 16
  // ---- Bulk loop without the xmm7 add: 4 vectors (48 bytes) per pass ----
  136. _noxlatelp:
  137. prefetchnta [eax+ecx+48]
  138. prefetchnta [eax+ecx+48+32]
  // Vector 0 -> xmm0 = (X0 X0 Y0 Z0)
  139. movss xmm0,[eax+ecx]
  140. BROADCAST(xmm0,0)
  141. movss xmm1,[eax+ecx+4]
  142. BROADCAST(xmm1,0)
  143. movss xmm2,[eax+ecx+8]
  144. BROADCAST(xmm2,0)
  145. mulps xmm0,xmm4
  146. mulps xmm1,xmm5
  147. mulps xmm2,xmm6
  148. addps xmm0,xmm1
  149. addps xmm0,xmm2
  // Vector 1 -> xmm3 = (X1 X1 Y1 Z1)
  150. movss xmm1,[eax+ecx+12]
  151. BROADCAST(xmm1,0)
  152. movss xmm2,[eax+ecx+16]
  153. BROADCAST(xmm2,0)
  154. movss xmm3,[eax+ecx+20]
  155. BROADCAST(xmm3,0)
  156. mulps xmm1,xmm4
  157. mulps xmm2,xmm5
  158. mulps xmm3,xmm6
  159. addps xmm1,xmm2
  160. addps xmm3,xmm1
  // Pack (X0 Y0 Z0 X1) and store the first 16 output bytes.
  161. movss xmm0,xmm3
  162. shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
  163. movaps [edx+ecx],xmm0
  164. prefetcht0 [edx+ecx+48]
  165. prefetcht0 [edx+ecx+48+32]
  // Vector 2 -> xmm0 = (X2 X2 Y2 Z2)
  166. movss xmm0,[eax+ecx+24]
  167. BROADCAST(xmm0,0)
  168. movss xmm1,[eax+ecx+24+4]
  169. BROADCAST(xmm1,0)
  170. movss xmm2,[eax+ecx+24+8]
  171. BROADCAST(xmm2,0)
  172. mulps xmm0,xmm4
  173. mulps xmm1,xmm5
  174. mulps xmm2,xmm6
  175. addps xmm0,xmm1
  176. addps xmm0,xmm2
  // Pack (Y1 Z1 X2 Y2) and store the middle 16 output bytes.
  177. shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
  178. movaps [edx+ecx+16],xmm3
  // Vector 3 -> xmm1 = (X3 X3 Y3 Z3)
  179. movss xmm1,[eax+ecx+24+12]
  180. BROADCAST(xmm1,0)
  181. movss xmm2,[eax+ecx+24+16]
  182. BROADCAST(xmm2,0)
  183. movss xmm3,[eax+ecx+24+20]
  184. BROADCAST(xmm3,0)
  185. mulps xmm1,xmm4
  186. mulps xmm2,xmm5
  187. mulps xmm3,xmm6
  188. addps xmm1,xmm2
  189. addps xmm1,xmm3
  // Rotate Z2 into lane 0, then pack (Z2 X3 Y3 Z3) and store the last 16 bytes.
  190. shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
  191. movss xmm1,xmm0
  192. movaps [edx+ecx+32],xmm1
  // Next group of four; keep looping while the offset is still negative.
  193. add ecx,48
  194. js _noxlatelp
  195. jmp _lp
  196. align 16
  // ---- Bulk loop with the xmm7 add: same packing as above, but xmm7 is
  // added to every transformed vector before it is packed ----
  197. _xlatelp:
  198. prefetchnta [eax+ecx+48]
  199. prefetchnta [eax+ecx+48+32]
  // Vector 0 -> xmm0 = (X0 X0 Y0 Z0)
  200. movss xmm0,[eax+ecx]
  201. BROADCAST(xmm0,0)
  202. movss xmm1,[eax+ecx+4]
  203. BROADCAST(xmm1,0)
  204. movss xmm2,[eax+ecx+8]
  205. BROADCAST(xmm2,0)
  206. mulps xmm0,xmm4
  207. mulps xmm1,xmm5
  208. mulps xmm2,xmm6
  209. addps xmm0,xmm1
  210. addps xmm0,xmm2
  211. addps xmm0,xmm7
  // Vector 1 -> xmm3 = (X1 X1 Y1 Z1)
  212. movss xmm1,[eax+ecx+12]
  213. BROADCAST(xmm1,0)
  214. movss xmm2,[eax+ecx+16]
  215. BROADCAST(xmm2,0)
  216. movss xmm3,[eax+ecx+20]
  217. BROADCAST(xmm3,0)
  218. mulps xmm1,xmm4
  219. mulps xmm2,xmm5
  220. mulps xmm3,xmm6
  221. addps xmm1,xmm2
  222. addps xmm3,xmm1
  223. addps xmm3,xmm7
  // Pack (X0 Y0 Z0 X1) and store the first 16 output bytes.
  224. movss xmm0,xmm3
  225. shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
  226. movaps [edx+ecx],xmm0
  227. prefetcht0 [edx+ecx+48]
  228. prefetcht0 [edx+ecx+48+32]
  // Vector 2 -> xmm0 = (X2 X2 Y2 Z2)
  229. movss xmm0,[eax+ecx+24]
  230. BROADCAST(xmm0,0)
  231. movss xmm1,[eax+ecx+24+4]
  232. BROADCAST(xmm1,0)
  233. movss xmm2,[eax+ecx+24+8]
  234. BROADCAST(xmm2,0)
  235. mulps xmm0,xmm4
  236. mulps xmm1,xmm5
  237. mulps xmm2,xmm6
  238. addps xmm0,xmm1
  239. addps xmm0,xmm2
  240. addps xmm0,xmm7
  // Pack (Y1 Z1 X2 Y2) and store the middle 16 output bytes.
  241. shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
  242. movaps [edx+ecx+16],xmm3
  // Vector 3 -> xmm1 = (X3 X3 Y3 Z3)
  243. movss xmm1,[eax+ecx+24+12]
  244. BROADCAST(xmm1,0)
  245. movss xmm2,[eax+ecx+24+16]
  246. BROADCAST(xmm2,0)
  247. movss xmm3,[eax+ecx+24+20]
  248. BROADCAST(xmm3,0)
  249. mulps xmm1,xmm4
  250. mulps xmm2,xmm5
  251. mulps xmm3,xmm6
  252. addps xmm1,xmm2
  253. addps xmm1,xmm3
  254. addps xmm1,xmm7
  // Rotate Z2 into lane 0, then pack (Z2 X3 Y3 Z3) and store the last 16 bytes.
  255. shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
  256. movss xmm1,xmm0
  257. movaps [edx+ecx+32],xmm1
  // Next group of four; keep looping while the offset is still negative.
  258. add ecx,48
  259. js _xlatelp
  260. jmp _lp
  // Exit label: reached from _lp once edi hits zero.
  261. _ulos:
  262. }
  263. }
  264. else
  265. #endif
  // Fallback: not the Intel compiler, or the CPU reports no SSE support.
  266. {
  267. mtx.mulVector3Array(src, dst, count);
  268. }
  269. }
  270. void VectorProcessorClass::Transform(Vector4* dst,const Vector3 *src, const Matrix4x4& matrix, const int count)
  271. {
  272. if (count<=0) return;
  273. int i;
  274. for (i=0; i<count; i++)
  275. {
  276. dst[i]=matrix*src[i];
  277. }
  278. }
  279. void VectorProcessorClass::Copy(Vector2 *dst, const Vector2 *src, int count)
  280. {
  281. if (count<=0) return;
  282. memcpy(dst,src,sizeof(Vector2)*count);
  283. }
  284. void VectorProcessorClass::Copy(unsigned *dst, const unsigned *src, int count)
  285. {
  286. if (count<=0) return;
  287. memcpy(dst,src,sizeof(unsigned)*count);
  288. }
  289. void VectorProcessorClass::Copy(Vector3 *dst, const Vector3 *src, int count)
  290. {
  291. if (count<=0) return;
  292. memcpy(dst,src,sizeof(Vector3)*count);
  293. }
  294. void VectorProcessorClass::Copy(Vector4 *dst, const Vector4 *src, int count)
  295. {
  296. if (count<=0) return;
  297. memcpy(dst,src,sizeof(Vector4)*count);
  298. }
  299. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float * srca, const int count)
  300. {
  301. if (count<=0) return;
  302. int i;
  303. for (i=0; i<count; i++)
  304. {
  305. dst[i].X=src[i].X;
  306. dst[i].Y=src[i].Y;
  307. dst[i].Z=src[i].Z;
  308. dst[i].W=srca[i];
  309. }
  310. }
  311. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float srca, const int count)
  312. {
  313. if (count<=0) return;
  314. int i;
  315. for (i=0; i<count; i++)
  316. {
  317. dst[i].X=src[i].X;
  318. dst[i].Y=src[i].Y;
  319. dst[i].Z=src[i].Z;
  320. dst[i].W=srca;
  321. }
  322. }
  323. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 &src, const float * srca, const int count)
  324. {
  325. if (count<=0) return;
  326. int i;
  327. for (i=0; i<count; i++)
  328. {
  329. dst[i].X=src.X;
  330. dst[i].Y=src.Y;
  331. dst[i].Z=src.Z;
  332. dst[i].W=srca[i];
  333. }
  334. }
  335. void VectorProcessorClass::CopyIndexed (unsigned *dst,const unsigned *src, const unsigned int *index, int count)
  336. {
  337. if (count<=0) return;
  338. int i;
  339. for (i=0; i<count; i++)
  340. {
  341. dst[i]=src[index[i]];
  342. }
  343. }
  344. void VectorProcessorClass::CopyIndexed (Vector2 *dst,const Vector2 *src, const unsigned int *index, int count)
  345. {
  346. if (count<=0) return;
  347. int i;
  348. for (i=0; i<count; i++)
  349. {
  350. dst[i]=src[index[i]];
  351. }
  352. }
  353. void VectorProcessorClass::CopyIndexed (Vector3 *dst,const Vector3 *src, const unsigned int *index, int count)
  354. {
  355. if (count<=0) return;
  356. int i;
  357. for (i=0; i<count; i++)
  358. {
  359. dst[i]=src[index[i]];
  360. }
  361. }
  362. void VectorProcessorClass::CopyIndexed (Vector4 *dst,const Vector4 *src, const unsigned int *index, int count)
  363. {
  364. if (count<=0) return;
  365. int i;
  366. for (i=0; i<count; i++)
  367. {
  368. dst[i]=src[index[i]];
  369. }
  370. }
  371. void VectorProcessorClass::CopyIndexed(unsigned char* dst, const unsigned char* src, const unsigned int *index, int count)
  372. {
  373. if (count<=0) return;
  374. int i;
  375. for (i=0; i<count; i++)
  376. {
  377. dst[i]=src[index[i]];
  378. }
  379. }
  380. void VectorProcessorClass::CopyIndexed(float* dst, float* src, const unsigned int *index, int count)
  381. {
  382. if (count<=0) return;
  383. int i;
  384. for (i=0; i<count; i++)
  385. {
  386. dst[i]=src[index[i]];
  387. }
  388. }
  389. void VectorProcessorClass::Clamp(Vector4 *dst,const Vector4 *src, const float min, const float max, const int count)
  390. {
  391. if (count<=0) return;
  392. int i;
  393. for (i=0; i<count; i++)
  394. {
  395. dst[i].X=(src[i].X<min)?min:src[i].X;
  396. dst[i].X=(src[i].X>max)?max:src[i].X;
  397. dst[i].Y=(src[i].Y<min)?min:src[i].Y;
  398. dst[i].Y=(src[i].Y>max)?max:src[i].Y;
  399. dst[i].Z=(src[i].Z<min)?min:src[i].Z;
  400. dst[i].Z=(src[i].Z>max)?max:src[i].Z;
  401. dst[i].W=(src[i].W<min)?min:src[i].W;
  402. dst[i].W=(src[i].W>max)?max:src[i].W;
  403. }
  404. }
  405. void VectorProcessorClass::Clear(Vector3*dst, const int count)
  406. {
  407. if (count<=0) return;
  408. memset(dst,0,sizeof(Vector3)*count);
  409. }
  410. void VectorProcessorClass::Normalize(Vector3 *dst, const int count)
  411. {
  412. if (count<=0) return;
  413. int i;
  414. for (i=0; i<count; i++)
  415. dst[i].Normalize();
  416. }
  417. void VectorProcessorClass::MinMax(Vector3 *src, Vector3 &min, Vector3 &max, const int count)
  418. {
  419. if (count<=0) return;
  420. min=*src;
  421. max=*src;
  422. int i;
  423. for (i=1; i<count; i++)
  424. {
  425. min.X=MIN(min.X,src[i].X);
  426. min.Y=MIN(min.Y,src[i].Y);
  427. min.Z=MIN(min.Z,src[i].Z);
  428. max.X=MAX(max.X,src[i].X);
  429. max.Y=MAX(max.Y,src[i].Y);
  430. max.Z=MAX(max.Z,src[i].Z);
  431. }
  432. }
  433. void VectorProcessorClass::MulAdd(float * dest,float multiplier,float add,int count)
  434. {
  435. for (int i=0; i<count; i++) {
  436. dest[i] = dest[i] * multiplier + add;
  437. }
  438. }
  439. void VectorProcessorClass::DotProduct(float *dst, const Vector3 &a, const Vector3 *b,const int count)
  440. {
  441. for (int i=0; i<count; i++)
  442. dst[i]=Vector3::Dot_Product(a,b[i]);
  443. }
  444. void VectorProcessorClass::ClampMin(float *dst, float *src, const float min, const int count)
  445. {
  446. for (int i=0; i<count; i++)
  447. dst[i]=(src[i]>min?src[i]:min);
  448. }
  449. void VectorProcessorClass::Power(float *dst, float *src, const float pow, const int count)
  450. {
  451. for (int i=0; i<count; i++)
  452. dst[i]=powf(src[i],pow);
  453. }