vp.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. /*
  2. ** Command & Conquer Generals(tm)
  3. ** Copyright 2025 Electronic Arts Inc.
  4. **
  5. ** This program is free software: you can redistribute it and/or modify
  6. ** it under the terms of the GNU General Public License as published by
  7. ** the Free Software Foundation, either version 3 of the License, or
  8. ** (at your option) any later version.
  9. **
  10. ** This program is distributed in the hope that it will be useful,
  11. ** but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ** GNU General Public License for more details.
  14. **
  15. ** You should have received a copy of the GNU General Public License
  16. ** along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. /***********************************************************************************************
  19. *** C O N F I D E N T I A L --- W E S T W O O D S T U D I O S ***
  20. ***********************************************************************************************
  21. * *
  22. * Project Name : wwmath *
  23. * *
  24. * $Archive:: /Commando/Code/WWMath/vp.cpp $*
  25. * *
  26. * Author:: Hector Yee *
  27. * *
  28. * $Modtime:: 6/27/01 4:16p $*
  29. * *
  30. * $Revision:: 11 $*
  31. * *
  32. *---------------------------------------------------------------------------------------------*/
  33. #include "vp.h"
  34. #include "vector2.h"
  35. #include "vector3.h"
  36. #include "vector4.h"
  37. #include "matrix3d.h"
  38. #include "matrix4.h"
  39. #include "wwdebug.h"
  40. #include "cpudetect.h"
  41. #include <memory.h>
  /*
  ** SHUFFLE packs four 2-bit lane selectors into the 8-bit immediate used by shufps.
  ** x occupies bits 7..6, y bits 5..4, z bits 3..2 and w bits 1..0; each argument is
  ** masked to its low two bits.
  */
  #define SHUFFLE(x, y, z, w)   (((x)&3)<< 6|((y)&3)<<4|((z)&3)<< 2|((w)&3))

  /*
  ** BROADCAST replicates lane INDEX of register XMM into all four lanes of XMM.
  ** Only usable inside an inline-assembly context.
  */
  #define BROADCAST(XMM, INDEX) __asm shufps XMM,XMM,SHUFFLE(INDEX, INDEX, INDEX, INDEX)

  /*
  ** TRANSPOSE treats BX, BY, BZ, BW as the four rows of a 4x4 float matrix and
  ** replaces them with the four columns (BX = column 0 ... BW = column 3).
  ** TV is a scratch register and is overwritten.
  */
  #define TRANSPOSE(BX, BY, BZ, BW, TV)              \
      __asm movaps   TV,BZ                           \
      __asm unpcklps BZ,BW                           \
      __asm unpckhps TV,BW                           \
      __asm movaps   BW,BX                           \
      __asm unpcklps BX,BY                           \
      __asm unpckhps BW,BY                           \
      __asm movaps   BY,BX                           \
      __asm shufps   BX,BZ,SHUFFLE(1, 0, 1, 0)       \
      __asm shufps   BY,BZ,SHUFFLE(3, 2, 3, 2)       \
      __asm movaps   BZ,BW                           \
      __asm shufps   BZ,TV,SHUFFLE(1, 0, 1, 0)       \
      __asm shufps   BW,TV,SHUFFLE(3, 2, 3, 2)
  57. void VectorProcessorClass::Prefetch(void* address)
  58. {
  59. #if defined (__ICL) // Detect Intel compiler
  60. if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
  61. __asm {
  62. // mov edx,address
  63. // mov eax,[edx]
  64. // prefetchT1 address
  65. }
  66. }
  67. #endif
  68. }
  69. static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);
  70. void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)
  71. {
  72. if (count<=0) return;
  73. #if defined (__ICL) // Detect Intel compiler
  74. if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
  75. __asm {
  76. mov edx,dst
  77. mov eax,src
  78. mov ebx,mtx
  79. mov edi,count
  80. movups xmm4,[ebx+0]
  81. movups xmm5,[ebx+16]
  82. movups xmm6,[ebx+32]
  83. movups xmm7,lastrow //[ebx+48]
  84. TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);
  85. shufps xmm4,xmm4,SHUFFLE(2,1,0,0)
  86. shufps xmm5,xmm5,SHUFFLE(2,1,0,0)
  87. shufps xmm6,xmm6,SHUFFLE(2,1,0,0)
  88. shufps xmm7,xmm7,SHUFFLE(2,1,0,0)
  89. mov esi,edx
  90. _lp:
  91. test edi,edi
  92. jz _ulos
  93. test esi,0xf
  94. jz _aligned
  95. movss xmm0,[eax]
  96. movss xmm1,[eax+4]
  97. movss xmm2,[eax+8]
  98. BROADCAST(xmm0,0)
  99. BROADCAST(xmm1,0)
  100. BROADCAST(xmm2,0)
  101. mulps xmm0,xmm4
  102. mulps xmm1,xmm5
  103. mulps xmm2,xmm6
  104. addps xmm0,xmm1
  105. addps xmm0,xmm2
  106. addps xmm0,xmm7
  107. movss [edx],xmm0
  108. movhps [edx+4],xmm0
  109. add eax,12
  110. add edx,12
  111. add esi,12
  112. dec edi
  113. jmp _lp
  114. _aligned:
  115. mov esi,1
  116. mov ecx,edi
  117. and edi,3
  118. and ecx,~3
  119. jz _lp
  120. lea ecx,[ecx+ecx*2]
  121. shl ecx,2
  122. add eax,ecx
  123. add edx,ecx
  124. neg ecx
  125. cmp dword ptr [ebx+12],0
  126. jne _xlatelp
  127. cmp dword ptr [ebx+28],0
  128. jne _xlatelp
  129. cmp dword ptr [ebx+44],0
  130. jne _xlatelp
  131. jmp _noxlatelp
  132. align 16
  133. _noxlatelp:
  134. prefetchnta [eax+ecx+48]
  135. prefetchnta [eax+ecx+48+32]
  136. movss xmm0,[eax+ecx]
  137. BROADCAST(xmm0,0)
  138. movss xmm1,[eax+ecx+4]
  139. BROADCAST(xmm1,0)
  140. movss xmm2,[eax+ecx+8]
  141. BROADCAST(xmm2,0)
  142. mulps xmm0,xmm4
  143. mulps xmm1,xmm5
  144. mulps xmm2,xmm6
  145. addps xmm0,xmm1
  146. addps xmm0,xmm2
  147. movss xmm1,[eax+ecx+12]
  148. BROADCAST(xmm1,0)
  149. movss xmm2,[eax+ecx+16]
  150. BROADCAST(xmm2,0)
  151. movss xmm3,[eax+ecx+20]
  152. BROADCAST(xmm3,0)
  153. mulps xmm1,xmm4
  154. mulps xmm2,xmm5
  155. mulps xmm3,xmm6
  156. addps xmm1,xmm2
  157. addps xmm3,xmm1
  158. movss xmm0,xmm3
  159. shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
  160. movaps [edx+ecx],xmm0
  161. prefetcht0 [edx+ecx+48]
  162. prefetcht0 [edx+ecx+48+32]
  163. movss xmm0,[eax+ecx+24]
  164. BROADCAST(xmm0,0)
  165. movss xmm1,[eax+ecx+24+4]
  166. BROADCAST(xmm1,0)
  167. movss xmm2,[eax+ecx+24+8]
  168. BROADCAST(xmm2,0)
  169. mulps xmm0,xmm4
  170. mulps xmm1,xmm5
  171. mulps xmm2,xmm6
  172. addps xmm0,xmm1
  173. addps xmm0,xmm2
  174. shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
  175. movaps [edx+ecx+16],xmm3
  176. movss xmm1,[eax+ecx+24+12]
  177. BROADCAST(xmm1,0)
  178. movss xmm2,[eax+ecx+24+16]
  179. BROADCAST(xmm2,0)
  180. movss xmm3,[eax+ecx+24+20]
  181. BROADCAST(xmm3,0)
  182. mulps xmm1,xmm4
  183. mulps xmm2,xmm5
  184. mulps xmm3,xmm6
  185. addps xmm1,xmm2
  186. addps xmm1,xmm3
  187. shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
  188. movss xmm1,xmm0
  189. movaps [edx+ecx+32],xmm1
  190. add ecx,48
  191. js _noxlatelp
  192. jmp _lp
  193. align 16
  194. _xlatelp:
  195. prefetchnta [eax+ecx+48]
  196. prefetchnta [eax+ecx+48+32]
  197. movss xmm0,[eax+ecx]
  198. BROADCAST(xmm0,0)
  199. movss xmm1,[eax+ecx+4]
  200. BROADCAST(xmm1,0)
  201. movss xmm2,[eax+ecx+8]
  202. BROADCAST(xmm2,0)
  203. mulps xmm0,xmm4
  204. mulps xmm1,xmm5
  205. mulps xmm2,xmm6
  206. addps xmm0,xmm1
  207. addps xmm0,xmm2
  208. addps xmm0,xmm7
  209. movss xmm1,[eax+ecx+12]
  210. BROADCAST(xmm1,0)
  211. movss xmm2,[eax+ecx+16]
  212. BROADCAST(xmm2,0)
  213. movss xmm3,[eax+ecx+20]
  214. BROADCAST(xmm3,0)
  215. mulps xmm1,xmm4
  216. mulps xmm2,xmm5
  217. mulps xmm3,xmm6
  218. addps xmm1,xmm2
  219. addps xmm3,xmm1
  220. addps xmm3,xmm7
  221. movss xmm0,xmm3
  222. shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
  223. movaps [edx+ecx],xmm0
  224. prefetcht0 [edx+ecx+48]
  225. prefetcht0 [edx+ecx+48+32]
  226. movss xmm0,[eax+ecx+24]
  227. BROADCAST(xmm0,0)
  228. movss xmm1,[eax+ecx+24+4]
  229. BROADCAST(xmm1,0)
  230. movss xmm2,[eax+ecx+24+8]
  231. BROADCAST(xmm2,0)
  232. mulps xmm0,xmm4
  233. mulps xmm1,xmm5
  234. mulps xmm2,xmm6
  235. addps xmm0,xmm1
  236. addps xmm0,xmm2
  237. addps xmm0,xmm7
  238. shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
  239. movaps [edx+ecx+16],xmm3
  240. movss xmm1,[eax+ecx+24+12]
  241. BROADCAST(xmm1,0)
  242. movss xmm2,[eax+ecx+24+16]
  243. BROADCAST(xmm2,0)
  244. movss xmm3,[eax+ecx+24+20]
  245. BROADCAST(xmm3,0)
  246. mulps xmm1,xmm4
  247. mulps xmm2,xmm5
  248. mulps xmm3,xmm6
  249. addps xmm1,xmm2
  250. addps xmm1,xmm3
  251. addps xmm1,xmm7
  252. shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
  253. movss xmm1,xmm0
  254. movaps [edx+ecx+32],xmm1
  255. add ecx,48
  256. js _xlatelp
  257. jmp _lp
  258. _ulos:
  259. }
  260. }
  261. else
  262. #endif
  263. {
  264. mtx.mulVector3Array(src, dst, count);
  265. }
  266. }
  267. void VectorProcessorClass::Transform(Vector4* dst,const Vector3 *src, const Matrix4& matrix, const int count)
  268. {
  269. if (count<=0) return;
  270. int i;
  271. for (i=0; i<count; i++)
  272. {
  273. dst[i]=matrix*src[i];
  274. }
  275. }
  276. void VectorProcessorClass::Copy(Vector2 *dst, const Vector2 *src, int count)
  277. {
  278. if (count<=0) return;
  279. memcpy(dst,src,sizeof(Vector2)*count);
  280. }
  281. void VectorProcessorClass::Copy(unsigned *dst, const unsigned *src, int count)
  282. {
  283. if (count<=0) return;
  284. memcpy(dst,src,sizeof(unsigned)*count);
  285. }
  286. void VectorProcessorClass::Copy(Vector3 *dst, const Vector3 *src, int count)
  287. {
  288. if (count<=0) return;
  289. memcpy(dst,src,sizeof(Vector3)*count);
  290. }
  291. void VectorProcessorClass::Copy(Vector4 *dst, const Vector4 *src, int count)
  292. {
  293. if (count<=0) return;
  294. memcpy(dst,src,sizeof(Vector4)*count);
  295. }
  296. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float * srca, const int count)
  297. {
  298. if (count<=0) return;
  299. int i;
  300. for (i=0; i<count; i++)
  301. {
  302. dst[i].X=src[i].X;
  303. dst[i].Y=src[i].Y;
  304. dst[i].Z=src[i].Z;
  305. dst[i].W=srca[i];
  306. }
  307. }
  308. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float srca, const int count)
  309. {
  310. if (count<=0) return;
  311. int i;
  312. for (i=0; i<count; i++)
  313. {
  314. dst[i].X=src[i].X;
  315. dst[i].Y=src[i].Y;
  316. dst[i].Z=src[i].Z;
  317. dst[i].W=srca;
  318. }
  319. }
  320. void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 &src, const float * srca, const int count)
  321. {
  322. if (count<=0) return;
  323. int i;
  324. for (i=0; i<count; i++)
  325. {
  326. dst[i].X=src.X;
  327. dst[i].Y=src.Y;
  328. dst[i].Z=src.Z;
  329. dst[i].W=srca[i];
  330. }
  331. }
  332. void VectorProcessorClass::CopyIndexed (unsigned *dst,const unsigned *src, const unsigned int *index, int count)
  333. {
  334. if (count<=0) return;
  335. int i;
  336. for (i=0; i<count; i++)
  337. {
  338. dst[i]=src[index[i]];
  339. }
  340. }
  341. void VectorProcessorClass::CopyIndexed (Vector2 *dst,const Vector2 *src, const unsigned int *index, int count)
  342. {
  343. if (count<=0) return;
  344. int i;
  345. for (i=0; i<count; i++)
  346. {
  347. dst[i]=src[index[i]];
  348. }
  349. }
  350. void VectorProcessorClass::CopyIndexed (Vector3 *dst,const Vector3 *src, const unsigned int *index, int count)
  351. {
  352. if (count<=0) return;
  353. int i;
  354. for (i=0; i<count; i++)
  355. {
  356. dst[i]=src[index[i]];
  357. }
  358. }
  359. void VectorProcessorClass::CopyIndexed (Vector4 *dst,const Vector4 *src, const unsigned int *index, int count)
  360. {
  361. if (count<=0) return;
  362. int i;
  363. for (i=0; i<count; i++)
  364. {
  365. dst[i]=src[index[i]];
  366. }
  367. }
  368. void VectorProcessorClass::CopyIndexed(unsigned char* dst, const unsigned char* src, const unsigned int *index, int count)
  369. {
  370. if (count<=0) return;
  371. int i;
  372. for (i=0; i<count; i++)
  373. {
  374. dst[i]=src[index[i]];
  375. }
  376. }
  377. void VectorProcessorClass::CopyIndexed(float* dst, float* src, const unsigned int *index, int count)
  378. {
  379. if (count<=0) return;
  380. int i;
  381. for (i=0; i<count; i++)
  382. {
  383. dst[i]=src[index[i]];
  384. }
  385. }
  386. void VectorProcessorClass::Clamp(Vector4 *dst,const Vector4 *src, const float min, const float max, const int count)
  387. {
  388. if (count<=0) return;
  389. int i;
  390. for (i=0; i<count; i++)
  391. {
  392. dst[i].X=(src[i].X<min)?min:src[i].X;
  393. dst[i].X=(src[i].X>max)?max:src[i].X;
  394. dst[i].Y=(src[i].Y<min)?min:src[i].Y;
  395. dst[i].Y=(src[i].Y>max)?max:src[i].Y;
  396. dst[i].Z=(src[i].Z<min)?min:src[i].Z;
  397. dst[i].Z=(src[i].Z>max)?max:src[i].Z;
  398. dst[i].W=(src[i].W<min)?min:src[i].W;
  399. dst[i].W=(src[i].W>max)?max:src[i].W;
  400. }
  401. }
  402. void VectorProcessorClass::Clear(Vector3*dst, const int count)
  403. {
  404. if (count<=0) return;
  405. memset(dst,0,sizeof(Vector3)*count);
  406. }
  407. void VectorProcessorClass::Normalize(Vector3 *dst, const int count)
  408. {
  409. if (count<=0) return;
  410. int i;
  411. for (i=0; i<count; i++)
  412. dst[i].Normalize();
  413. }
  414. void VectorProcessorClass::MinMax(Vector3 *src, Vector3 &min, Vector3 &max, const int count)
  415. {
  416. if (count<=0) return;
  417. min=*src;
  418. max=*src;
  419. int i;
  420. for (i=1; i<count; i++)
  421. {
  422. min.X=MIN(min.X,src[i].X);
  423. min.Y=MIN(min.Y,src[i].Y);
  424. min.Z=MIN(min.Z,src[i].Z);
  425. max.X=MAX(max.X,src[i].X);
  426. max.Y=MAX(max.Y,src[i].Y);
  427. max.Z=MAX(max.Z,src[i].Z);
  428. }
  429. }
  430. void VectorProcessorClass::MulAdd(float * dest,float multiplier,float add,int count)
  431. {
  432. for (int i=0; i<count; i++) {
  433. dest[i] = dest[i] * multiplier + add;
  434. }
  435. }
  436. void VectorProcessorClass::DotProduct(float *dst, const Vector3 &a, const Vector3 *b,const int count)
  437. {
  438. for (int i=0; i<count; i++)
  439. dst[i]=Vector3::Dot_Product(a,b[i]);
  440. }
  441. void VectorProcessorClass::ClampMin(float *dst, float *src, const float min, const int count)
  442. {
  443. for (int i=0; i<count; i++)
  444. dst[i]=(src[i]>min?src[i]:min);
  445. }
  446. void VectorProcessorClass::Power(float *dst, float *src, const float pow, const int count)
  447. {
  448. for (int i=0; i<count; i++)
  449. dst[i]=powf(src[i],pow);
  450. }