vp.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. /*
  2. ** Command & Conquer Renegade(tm)
  3. ** Copyright 2025 Electronic Arts Inc.
  4. **
  5. ** This program is free software: you can redistribute it and/or modify
  6. ** it under the terms of the GNU General Public License as published by
  7. ** the Free Software Foundation, either version 3 of the License, or
  8. ** (at your option) any later version.
  9. **
  10. ** This program is distributed in the hope that it will be useful,
  11. ** but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ** GNU General Public License for more details.
  14. **
  15. ** You should have received a copy of the GNU General Public License
  16. ** along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. /***********************************************************************************************
  19. *** C O N F I D E N T I A L --- W E S T W O O D S T U D I O S ***
  20. ***********************************************************************************************
  21. * *
  22. * Project Name : wwmath *
  23. * *
  24. * $Archive:: /Commando/Code/WWMath/vp.cpp $*
  25. * *
  26. * Author:: Hector Yee *
  27. * *
  28. * $Modtime:: 6/27/01 4:16p $*
  29. * *
  30. * $Revision:: 11 $*
  31. * *
  32. *---------------------------------------------------------------------------------------------*/
  33. #include "vp.h"
  34. #include "vector2.h"
  35. #include "vector3.h"
  36. #include "vector4.h"
  37. #include "matrix3d.h"
  38. #include "matrix4.h"
  39. #include "wwdebug.h"
  40. #include "cpudetect.h"
  41. #include <memory.h>
  // Builds the 8-bit immediate operand for shufps from four 2-bit selectors:
  // x goes to bits 7:6, y to bits 5:4, z to bits 3:2 and w to bits 1:0.
  42. #define SHUFFLE(x, y, z, w) (((x)&3)<< 6|((y)&3)<<4|((z)&3)<< 2|((w)&3))
  // Emits a shufps of XMM with itself in which all four selectors are INDEX,
  // i.e. element INDEX of the register is replicated into every lane.
  43. #define BROADCAST(XMM, INDEX) __asm shufps XMM,XMM,(((INDEX)&3)<< 6|((INDEX)&3)<<4|((INDEX)&3)<< 2|((INDEX)&3))
  // Emits an inline-assembly sequence that transposes the 4x4 block held in the
  // registers BX, BY, BZ, BW (one row each) in place. TV is a scratch register and
  // is overwritten.
  44. #define TRANSPOSE(BX, BY, BZ, BW, TV) \
  45. __asm movaps TV,BZ \
  46. __asm unpcklps BZ,BW \
  47. __asm unpckhps TV,BW \
  48. __asm movaps BW,BX \
  49. __asm unpcklps BX,BY \
  50. __asm unpckhps BW,BY \
  51. __asm movaps BY,BX \
  52. __asm shufps BX,BZ,SHUFFLE(1, 0, 1, 0) \
  53. __asm shufps BY,BZ,SHUFFLE(3, 2, 3, 2) \
  54. __asm movaps BZ,BW \
  55. __asm shufps BZ,TV,SHUFFLE(1, 0, 1, 0) \
  56. __asm shufps BW,TV,SHUFFLE(3, 2, 3, 2)
  57. void VectorProcessorClass::Prefetch(void* address)
  58. {
  59. #if defined (__ICL) // Detect Intel compiler
  60. if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
  61. __asm {
  62. // mov edx,address
  63. // mov eax,[edx]
  64. // prefetchT1 address
  65. }
  66. }
  67. #endif
  68. }
  // Constant (0,0,0,1) vector. The SSE path of Transform below loads it into xmm7 as the
  // fourth row fed to TRANSPOSE; the commented-out "[ebx+48]" next to that load suggests it
  // stands in for a fourth matrix row.
  69. static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);
  /*
  ** Transform (Vector3 <- Matrix3D * Vector3)
  ** Writes mtx * src[i] to dst[i] for i in [0, count). Returns immediately when count <= 0.
  **
  ** When built with the Intel compiler (__ICL) and CPUDetectClass reports SSE support, the
  ** inline-assembly path is used; otherwise the plain C++ loop at the bottom runs.
  **
  ** Outline of the SSE path, as written below:
  **  - Three 16-byte rows are loaded from the matrix (offsets 0, 16, 32) together with the
  **    file-level `lastrow` constant, transposed, and each resulting register is shuffled
  **    into the lane order (e0, e0, e1, e2).
  **  - _lp: while esi (initially the dst address) has any of its low four bits set, one
  **    vector is transformed per iteration and stored with movss + movhps.
  **  - _aligned: once esi is a multiple of 16, the count is split into a multiple of four
  **    (handled 48 bytes at a time with three movaps stores) and a remainder (count & 3);
  **    esi is set to 1 so the remainder goes back through the one-at-a-time path.
  **  - _noxlatelp is chosen when the dwords at matrix offsets 12, 28 and 44 are all zero and
  **    omits the addps with xmm7; _xlatelp includes it.
  **    NOTE(review): those offsets look like the translation entries of Matrix3D - confirm
  **    against matrix3d.h.
  */
  70. void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)
  71. {
  72. if (count<=0) return;
  73. #if defined (__ICL) // Detect Intel compiler
  74. if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
  75. __asm {
  // edx = dst, eax = src, ebx = matrix, edi = remaining vector count
  76. mov edx,dst
  77. mov eax,src
  78. mov ebx,mtx
  79. mov edi,count
  // xmm4..xmm6 = the three matrix rows, xmm7 = lastrow; transposed just below
  80. movups xmm4,[ebx+0]
  81. movups xmm5,[ebx+16]
  82. movups xmm6,[ebx+32]
  83. movups xmm7,lastrow //[ebx+48]
  84. TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);
  // Rearrange each register to (e0, e0, e1, e2) so a result can be stored as
  // movss (lane 0) followed by movhps (lanes 2 and 3).
  85. shufps xmm4,xmm4,SHUFFLE(2,1,0,0)
  86. shufps xmm5,xmm5,SHUFFLE(2,1,0,0)
  87. shufps xmm6,xmm6,SHUFFLE(2,1,0,0)
  88. shufps xmm7,xmm7,SHUFFLE(2,1,0,0)
  // esi tracks the destination address for the 16-byte alignment test
  89. mov esi,edx
  90. _lp:
  // Done when no vectors remain; switch to the four-at-a-time path once esi is 16-byte aligned
  91. test edi,edi
  92. jz _ulos
  93. test esi,0xf
  94. jz _aligned
  // One vector: broadcast x, y, z, multiply by xmm4..xmm6, sum, then add xmm7
  95. movss xmm0,[eax]
  96. movss xmm1,[eax+4]
  97. movss xmm2,[eax+8]
  98. BROADCAST(xmm0,0)
  99. BROADCAST(xmm1,0)
  100. BROADCAST(xmm2,0)
  101. mulps xmm0,xmm4
  102. mulps xmm1,xmm5
  103. mulps xmm2,xmm6
  104. addps xmm0,xmm1
  105. addps xmm0,xmm2
  106. addps xmm0,xmm7
  107. movss [edx],xmm0
  108. movhps [edx+4],xmm0
  109. add eax,12
  110. add edx,12
  111. add esi,12
  112. dec edi
  113. jmp _lp
  114. _aligned:
  // esi = 1 makes the alignment test above fail from now on, so whatever is left
  // after the grouped loop is handled one vector at a time.
  115. mov esi,1
  // ecx = count rounded down to a multiple of 4, edi = count & 3
  116. mov ecx,edi
  117. and edi,3
  118. and ecx,~3
  119. jz _lp
  // ecx = ecx * 12 bytes; advance eax/edx past the grouped range and index it with a
  // negative offset that counts up towards zero.
  120. lea ecx,[ecx+ecx*2]
  121. shl ecx,2
  122. add eax,ecx
  123. add edx,ecx
  124. neg ecx
  // Integer compare of the dwords at matrix offsets 12, 28, 44: all zero -> _noxlatelp
  125. cmp dword ptr [ebx+12],0
  126. jne _xlatelp
  127. cmp dword ptr [ebx+28],0
  128. jne _xlatelp
  129. cmp dword ptr [ebx+44],0
  130. jne _xlatelp
  131. jmp _noxlatelp
  132. align 16
  133. _noxlatelp:
  // Four vectors (48 bytes) per iteration, without the xmm7 add
  134. prefetchnta [eax+ecx+48]
  135. prefetchnta [eax+ecx+48+32]
  136. movss xmm0,[eax+ecx]
  137. BROADCAST(xmm0,0)
  138. movss xmm1,[eax+ecx+4]
  139. BROADCAST(xmm1,0)
  140. movss xmm2,[eax+ecx+8]
  141. BROADCAST(xmm2,0)
  142. mulps xmm0,xmm4
  143. mulps xmm1,xmm5
  144. mulps xmm2,xmm6
  145. addps xmm0,xmm1
  146. addps xmm0,xmm2
  147. movss xmm1,[eax+ecx+12]
  148. BROADCAST(xmm1,0)
  149. movss xmm2,[eax+ecx+16]
  150. BROADCAST(xmm2,0)
  151. movss xmm3,[eax+ecx+20]
  152. BROADCAST(xmm3,0)
  153. mulps xmm1,xmm4
  154. mulps xmm2,xmm5
  155. mulps xmm3,xmm6
  156. addps xmm1,xmm2
  157. addps xmm3,xmm1
  // Pack result 0 (three lanes) with the first lane of result 1 into one 16-byte store
  158. movss xmm0,xmm3
  159. shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
  160. movaps [edx+ecx],xmm0
  161. prefetcht0 [edx+ecx+48]
  162. prefetcht0 [edx+ecx+48+32]
  163. movss xmm0,[eax+ecx+24]
  164. BROADCAST(xmm0,0)
  165. movss xmm1,[eax+ecx+24+4]
  166. BROADCAST(xmm1,0)
  167. movss xmm2,[eax+ecx+24+8]
  168. BROADCAST(xmm2,0)
  169. mulps xmm0,xmm4
  170. mulps xmm1,xmm5
  171. mulps xmm2,xmm6
  172. addps xmm0,xmm1
  173. addps xmm0,xmm2
  // Second store: the last two lanes of result 1 followed by the first two of result 2
  174. shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
  175. movaps [edx+ecx+16],xmm3
  176. movss xmm1,[eax+ecx+24+12]
  177. BROADCAST(xmm1,0)
  178. movss xmm2,[eax+ecx+24+16]
  179. BROADCAST(xmm2,0)
  180. movss xmm3,[eax+ecx+24+20]
  181. BROADCAST(xmm3,0)
  182. mulps xmm1,xmm4
  183. mulps xmm2,xmm5
  184. mulps xmm3,xmm6
  185. addps xmm1,xmm2
  186. addps xmm1,xmm3
  // Third store: the last lane of result 2 followed by the three lanes of result 3
  187. shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
  188. movss xmm1,xmm0
  189. movaps [edx+ecx+32],xmm1
  // Loop while the negative offset has not yet reached zero
  190. add ecx,48
  191. js _noxlatelp
  192. jmp _lp
  193. align 16
  194. _xlatelp:
  // Same four-vector schedule as _noxlatelp, with xmm7 added to every result
  195. prefetchnta [eax+ecx+48]
  196. prefetchnta [eax+ecx+48+32]
  197. movss xmm0,[eax+ecx]
  198. BROADCAST(xmm0,0)
  199. movss xmm1,[eax+ecx+4]
  200. BROADCAST(xmm1,0)
  201. movss xmm2,[eax+ecx+8]
  202. BROADCAST(xmm2,0)
  203. mulps xmm0,xmm4
  204. mulps xmm1,xmm5
  205. mulps xmm2,xmm6
  206. addps xmm0,xmm1
  207. addps xmm0,xmm2
  208. addps xmm0,xmm7
  209. movss xmm1,[eax+ecx+12]
  210. BROADCAST(xmm1,0)
  211. movss xmm2,[eax+ecx+16]
  212. BROADCAST(xmm2,0)
  213. movss xmm3,[eax+ecx+20]
  214. BROADCAST(xmm3,0)
  215. mulps xmm1,xmm4
  216. mulps xmm2,xmm5
  217. mulps xmm3,xmm6
  218. addps xmm1,xmm2
  219. addps xmm3,xmm1
  220. addps xmm3,xmm7
  221. movss xmm0,xmm3
  222. shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
  223. movaps [edx+ecx],xmm0
  224. prefetcht0 [edx+ecx+48]
  225. prefetcht0 [edx+ecx+48+32]
  226. movss xmm0,[eax+ecx+24]
  227. BROADCAST(xmm0,0)
  228. movss xmm1,[eax+ecx+24+4]
  229. BROADCAST(xmm1,0)
  230. movss xmm2,[eax+ecx+24+8]
  231. BROADCAST(xmm2,0)
  232. mulps xmm0,xmm4
  233. mulps xmm1,xmm5
  234. mulps xmm2,xmm6
  235. addps xmm0,xmm1
  236. addps xmm0,xmm2
  237. addps xmm0,xmm7
  238. shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
  239. movaps [edx+ecx+16],xmm3
  240. movss xmm1,[eax+ecx+24+12]
  241. BROADCAST(xmm1,0)
  242. movss xmm2,[eax+ecx+24+16]
  243. BROADCAST(xmm2,0)
  244. movss xmm3,[eax+ecx+24+20]
  245. BROADCAST(xmm3,0)
  246. mulps xmm1,xmm4
  247. mulps xmm2,xmm5
  248. mulps xmm3,xmm6
  249. addps xmm1,xmm2
  250. addps xmm1,xmm3
  251. addps xmm1,xmm7
  252. shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
  253. movss xmm1,xmm0
  254. movaps [edx+ecx+32],xmm1
  255. add ecx,48
  256. js _xlatelp
  257. jmp _lp
  258. _ulos:
  259. }
  260. }
  261. else
  262. #endif
  // Portable fallback: one matrix-vector product per element
  263. {
  264. int i;
  265. for (i=0; i<count; i++)
  266. {
  267. dst[i]=mtx*src[i];
  268. }
  269. }
  270. }
  271. void VectorProcessorClass::Transform(Vector4* dst,const Vector3 *src, const Matrix4& matrix, const int count)
  272. {
  273. if (count<=0) return;
  274. int i;
  275. for (i=0; i<count; i++)
  276. {
  277. dst[i]=matrix*src[i];
  278. }
  279. }
  280. void VectorProcessorClass::Copy(Vector2 *dst, const Vector2 *src, int count)
  281. {
  282. if (count<=0) return;
  283. memcpy(dst,src,sizeof(Vector2)*count);
  284. }
  285. void VectorProcessorClass::Copy(unsigned *dst, const unsigned *src, int count)
  286. {
  287. if (count<=0) return;
  288. memcpy(dst,src,sizeof(unsigned)*count);
  289. }
  290. void VectorProcessorClass::Copy(Vector3 *dst, const Vector3 *src, int count)
  291. {
  292. if (count<=0) return;
  293. memcpy(dst,src,sizeof(Vector3)*count);
  294. }
  295. void VectorProcessorClass::Copy(Vector4 *dst, const Vector4 *src, int count)
  296. {
  297. if (count<=0) return;
  298. memcpy(dst,src,sizeof(Vector4)*count);
  299. }
  300. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float * srca, const int count)
  301. {
  302. if (count<=0) return;
  303. int i;
  304. for (i=0; i<count; i++)
  305. {
  306. dst[i].X=src[i].X;
  307. dst[i].Y=src[i].Y;
  308. dst[i].Z=src[i].Z;
  309. dst[i].W=srca[i];
  310. }
  311. }
  312. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float srca, const int count)
  313. {
  314. if (count<=0) return;
  315. int i;
  316. for (i=0; i<count; i++)
  317. {
  318. dst[i].X=src[i].X;
  319. dst[i].Y=src[i].Y;
  320. dst[i].Z=src[i].Z;
  321. dst[i].W=srca;
  322. }
  323. }
  324. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 &src, const float * srca, const int count)
  325. {
  326. if (count<=0) return;
  327. int i;
  328. for (i=0; i<count; i++)
  329. {
  330. dst[i].X=src.X;
  331. dst[i].Y=src.Y;
  332. dst[i].Z=src.Z;
  333. dst[i].W=srca[i];
  334. }
  335. }
  336. void VectorProcessorClass::CopyIndexed (unsigned *dst,const unsigned *src, const unsigned int *index, int count)
  337. {
  338. if (count<=0) return;
  339. int i;
  340. for (i=0; i<count; i++)
  341. {
  342. dst[i]=src[index[i]];
  343. }
  344. }
  345. void VectorProcessorClass::CopyIndexed (Vector2 *dst,const Vector2 *src, const unsigned int *index, int count)
  346. {
  347. if (count<=0) return;
  348. int i;
  349. for (i=0; i<count; i++)
  350. {
  351. dst[i]=src[index[i]];
  352. }
  353. }
  354. void VectorProcessorClass::CopyIndexed (Vector3 *dst,const Vector3 *src, const unsigned int *index, int count)
  355. {
  356. if (count<=0) return;
  357. int i;
  358. for (i=0; i<count; i++)
  359. {
  360. dst[i]=src[index[i]];
  361. }
  362. }
  363. void VectorProcessorClass::CopyIndexed (Vector4 *dst,const Vector4 *src, const unsigned int *index, int count)
  364. {
  365. if (count<=0) return;
  366. int i;
  367. for (i=0; i<count; i++)
  368. {
  369. dst[i]=src[index[i]];
  370. }
  371. }
  372. void VectorProcessorClass::CopyIndexed(unsigned char* dst, const unsigned char* src, const unsigned int *index, int count)
  373. {
  374. if (count<=0) return;
  375. int i;
  376. for (i=0; i<count; i++)
  377. {
  378. dst[i]=src[index[i]];
  379. }
  380. }
  381. void VectorProcessorClass::CopyIndexed(float* dst, float* src, const unsigned int *index, int count)
  382. {
  383. if (count<=0) return;
  384. int i;
  385. for (i=0; i<count; i++)
  386. {
  387. dst[i]=src[index[i]];
  388. }
  389. }
  390. void VectorProcessorClass::Clamp(Vector4 *dst,const Vector4 *src, const float min, const float max, const int count)
  391. {
  392. if (count<=0) return;
  393. int i;
  394. for (i=0; i<count; i++)
  395. {
  396. dst[i].X=(src[i].X<min)?min:src[i].X;
  397. dst[i].X=(src[i].X>max)?max:src[i].X;
  398. dst[i].Y=(src[i].Y<min)?min:src[i].Y;
  399. dst[i].Y=(src[i].Y>max)?max:src[i].Y;
  400. dst[i].Z=(src[i].Z<min)?min:src[i].Z;
  401. dst[i].Z=(src[i].Z>max)?max:src[i].Z;
  402. dst[i].W=(src[i].W<min)?min:src[i].W;
  403. dst[i].W=(src[i].W>max)?max:src[i].W;
  404. }
  405. }
  406. void VectorProcessorClass::Clear(Vector3*dst, const int count)
  407. {
  408. if (count<=0) return;
  409. memset(dst,0,sizeof(Vector3)*count);
  410. }
  411. void VectorProcessorClass::Normalize(Vector3 *dst, const int count)
  412. {
  413. if (count<=0) return;
  414. int i;
  415. for (i=0; i<count; i++)
  416. dst[i].Normalize();
  417. }
  418. void VectorProcessorClass::MinMax(Vector3 *src, Vector3 &min, Vector3 &max, const int count)
  419. {
  420. if (count<=0) return;
  421. min=*src;
  422. max=*src;
  423. int i;
  424. for (i=1; i<count; i++)
  425. {
  426. min.X=MIN(min.X,src[i].X);
  427. min.Y=MIN(min.Y,src[i].Y);
  428. min.Z=MIN(min.Z,src[i].Z);
  429. max.X=MAX(max.X,src[i].X);
  430. max.Y=MAX(max.Y,src[i].Y);
  431. max.Z=MAX(max.Z,src[i].Z);
  432. }
  433. }
  434. void VectorProcessorClass::MulAdd(float * dest,float multiplier,float add,int count)
  435. {
  436. for (int i=0; i<count; i++) {
  437. dest[i] = dest[i] * multiplier + add;
  438. }
  439. }
  440. void VectorProcessorClass::DotProduct(float *dst, const Vector3 &a, const Vector3 *b,const int count)
  441. {
  442. for (int i=0; i<count; i++)
  443. dst[i]=Vector3::Dot_Product(a,b[i]);
  444. }
  445. void VectorProcessorClass::ClampMin(float *dst, float *src, const float min, const int count)
  446. {
  447. for (int i=0; i<count; i++)
  448. dst[i]=(src[i]>min?src[i]:min);
  449. }
  450. void VectorProcessorClass::Power(float *dst, float *src, const float pow, const int count)
  451. {
  452. for (int i=0; i<count; i++)
  453. dst[i]=powf(src[i],pow);
  454. }