// platformCPUCount.cpp (19 KB)
  // Original code is:
  // Copyright (c) 2005 Intel Corporation
  // All Rights Reserved
  //
  // CPUCount.cpp : Detects three forms of hardware multi-threading support across IA-32 platforms.
  // The three forms of HW multithreading are: Multi-processor, Multi-core, and
  // HyperThreading Technology.
  // This application enumerates all the logical processors enabled by the OS and BIOS and
  // determines the HW topology of these enabled logical processors in the system
  // using information provided by the CPUID instruction.
  // A multi-processing system can support any combination of the three forms of HW
  // multi-threading support. The relevant topology can be identified using a
  // three-level decomposition of the "initial APIC ID" into
  // Package_id, core_id, and SMT_id. Such a decomposition provides a three-level map of
  // the topology of hardware resources and
  // allows multi-threaded software to manage shared hardware resources in
  // the platform to reduce resource contention.
  // The multi-core detection algorithm for processor and cache topology requires
  // that all leaf functions of the CPUID instruction be available. The system administrator
  // must ensure that BIOS settings are not configured to restrict CPUID functionality.
  //-------------------------------------------------------------------------------------------------
  22. #if defined(TORQUE_OS_LINUX) || defined(LINUX)
  23. // TODO GCC code don't compile on Release with optimizations, mover code to platform layer
  24. #else
  25. #include "platform/platform.h"
  26. #include "platform/platformCPUCount.h"
  27. #if defined(TORQUE_OS_LINUX) || defined(TORQUE_OS_OSX)
  28. #ifdef TORQUE_OS_LINUX
  29. // The Linux source code listing can be compiled using Linux kernel verison 2.6
  30. // or higher (e.g. RH 4AS-2.8 using GCC 3.4.4).
  31. // Due to syntax variances of Linux affinity APIs with earlier kernel versions
  32. // and dependence on glibc library versions, compilation on Linux environment
  33. // with older kernels and compilers may require kernel patches or compiler upgrades.
  34. #include <stdlib.h>
  35. #include <unistd.h>
  36. #include <string.h>
  37. #include <sched.h>
  38. #define DWORD unsigned long
  39. #elif defined( TORQUE_OS_WIN )
  40. #include <windows.h>
  41. #elif defined( TORQUE_OS_MAC )
  42. # include <sys/types.h>
  43. # include <sys/sysctl.h>
  44. #else
  45. #error Not implemented on platform.
  46. #endif
  47. #include <stdio.h>
  48. #include <assert.h>
  49. namespace CPUInfo {
  // Bit masks selecting the CPUID register fields used by the helpers below.

  #define HWD_MT_BIT 0x10000000 // EDX[28] Bit 28 is set if HT or multi-core is supported

  #define NUM_LOGICAL_BITS 0x00FF0000 // EBX[23:16] Bits 16-23 in ebx contain the number of logical
                                      // processors per physical processor when executing cpuid with
                                      // eax set to 1

  #define NUM_CORE_BITS 0xFC000000 // EAX[31:26] Bits 26-31 in eax contain the number of cores minus one
                                   // per physical processor when executing cpuid with
                                   // eax set to 4.

  #define INITIAL_APIC_ID_BITS 0xFF000000 // EBX[31:24] Bits 24-31 (8 bits) return the 8-bit unique
                                          // initial APIC ID for the processor this code is running on.
  59. #ifndef TORQUE_OS_MAC
  60. static U32 CpuIDSupported(void);
  61. static U32 find_maskwidth(unsigned int);
  62. static U32 HWD_MTSupported(void);
  63. static U32 MaxLogicalProcPerPhysicalProc(void);
  64. static U32 MaxCorePerPhysicalProc(void);
  65. static U8 GetAPIC_ID(void);
  66. static U8 GetNzbSubID(U8, U8, U8);
  67. #endif
  68. static char g_s3Levels[2048];
  69. #ifndef TORQUE_OS_MAC
  70. //
  71. // CpuIDSupported will return 0 if CPUID instruction is unavailable. Otherwise, it will return
  72. // the maximum supported standard function.
  73. //
  74. static U32 CpuIDSupported(void)
  75. {
  76. U32 maxInputValue = 0;
  77. // If CPUID instruction is supported
  78. #ifdef TORQUE_COMPILER_GCC
  79. try
  80. {
  81. // call cpuid with eax = 0
  82. asm
  83. (
  84. "pushl %%ebx\n\t"
  85. "xorl %%eax,%%eax\n\t"
  86. "cpuid\n\t"
  87. "popl %%ebx\n\t"
  88. : "=a" (maxInputValue)
  89. :
  90. : "%ecx", "%edx"
  91. );
  92. }
  93. catch (...)
  94. {
  95. return(0); // cpuid instruction is unavailable
  96. }
  97. #elif defined( TORQUE_COMPILER_VISUALC )
  98. try
  99. {
  100. // call cpuid with eax = 0
  101. __asm
  102. {
  103. xor eax, eax
  104. cpuid
  105. mov maxInputValue, eax
  106. }
  107. }
  108. catch (...)
  109. {
  110. // cpuid instruction is unavailable
  111. }
  112. #else
  113. # error Not implemented.
  114. #endif
  115. return maxInputValue;
  116. }
  117. //
  118. // Function returns the maximum cores per physical package. Note that the number of
  119. // AVAILABLE cores per physical to be used by an application might be less than this
  120. // maximum value.
  121. //
  122. static U32 MaxCorePerPhysicalProc(void)
  123. {
  124. U32 Regeax = 0;
  125. if (!HWD_MTSupported()) return (U32) 1; // Single core
  126. #ifdef TORQUE_COMPILER_GCC
  127. {
  128. asm
  129. (
  130. "pushl %ebx\n\t"
  131. "xorl %eax, %eax\n\t"
  132. "cpuid\n\t"
  133. "cmpl $4, %eax\n\t" // check if cpuid supports leaf 4
  134. "jl .single_core\n\t" // Single core
  135. "movl $4, %eax\n\t"
  136. "movl $0, %ecx\n\t" // start with index = 0; Leaf 4 reports
  137. "popl %ebx\n\t"
  138. ); // at least one valid cache level
  139. asm
  140. (
  141. "cpuid"
  142. : "=a" (Regeax)
  143. :
  144. : "%ecx", "%edx"
  145. );
  146. asm
  147. (
  148. "jmp .multi_core\n"
  149. ".single_core:\n\t"
  150. "xor %eax, %eax\n"
  151. ".multi_core:"
  152. );
  153. }
  154. #elif defined( TORQUE_COMPILER_VISUALC )
  155. __asm
  156. {
  157. xor eax, eax
  158. cpuid
  159. cmp eax, 4 // check if cpuid supports leaf 4
  160. jl single_core // Single core
  161. mov eax, 4
  162. mov ecx, 0 // start with index = 0; Leaf 4 reports
  163. cpuid // at least one valid cache level
  164. mov Regeax, eax
  165. jmp multi_core
  166. single_core:
  167. xor eax, eax
  168. multi_core:
  169. }
  170. #else
  171. # error Not implemented.
  172. #endif
  173. return (U32)((Regeax & NUM_CORE_BITS) >> 26)+1;
  174. }
  175. //
  176. // The function returns 0 when the hardware multi-threaded bit is not set.
  177. //
  178. static U32 HWD_MTSupported(void)
  179. {
  180. U32 Regedx = 0;
  181. if ((CpuIDSupported() >= 1))
  182. {
  183. #ifdef TORQUE_COMPILER_GCC
  184. asm
  185. (
  186. "pushl %%ebx\n\t"
  187. "movl $1,%%eax\n\t"
  188. "cpuid\n\t"
  189. "popl %%ebx\n\t"
  190. : "=d" (Regedx)
  191. :
  192. : "%eax","%ecx"
  193. );
  194. #elif defined( TORQUE_COMPILER_VISUALC )
  195. __asm
  196. {
  197. mov eax, 1
  198. cpuid
  199. mov Regedx, edx
  200. }
  201. #else
  202. # error Not implemented.
  203. #endif
  204. }
  205. return (Regedx & HWD_MT_BIT);
  206. }
  207. //
  208. // Function returns the maximum logical processors per physical package. Note that the number of
  209. // AVAILABLE logical processors per physical to be used by an application might be less than this
  210. // maximum value.
  211. //
  212. static U32 MaxLogicalProcPerPhysicalProc(void)
  213. {
  214. U32 Regebx = 0;
  215. if (!HWD_MTSupported()) return (U32) 1;
  216. #ifdef TORQUE_COMPILER_GCC
  217. asm
  218. (
  219. "movl $1,%%eax\n\t"
  220. "cpuid"
  221. : "=b" (Regebx)
  222. :
  223. : "%eax","%ecx","%edx"
  224. );
  225. #elif defined( TORQUE_COMPILER_VISUALC )
  226. __asm
  227. {
  228. mov eax, 1
  229. cpuid
  230. mov Regebx, ebx
  231. }
  232. #else
  233. # error Not implemented.
  234. #endif
  235. return (unsigned int) ((Regebx & NUM_LOGICAL_BITS) >> 16);
  236. }
  237. static U8 GetAPIC_ID(void)
  238. {
  239. U32 Regebx = 0;
  240. #ifdef TORQUE_COMPILER_GCC
  241. asm
  242. (
  243. "movl $1, %%eax\n\t"
  244. "cpuid"
  245. : "=b" (Regebx)
  246. :
  247. : "%eax","%ecx","%edx"
  248. );
  249. #elif defined( TORQUE_COMPILER_VISUALC )
  250. __asm
  251. {
  252. mov eax, 1
  253. cpuid
  254. mov Regebx, ebx
  255. }
  256. #else
  257. # error Not implemented.
  258. #endif
  259. return (unsigned char) ((Regebx & INITIAL_APIC_ID_BITS) >> 24);
  260. }
  261. //
  262. // Determine the width of the bit field that can represent the value count_item.
  263. //
  264. U32 find_maskwidth(U32 CountItem)
  265. {
  266. U32 MaskWidth,
  267. count = CountItem;
  268. #ifdef TORQUE_COMPILER_GCC
  269. asm
  270. (
  271. #ifdef __x86_64__ // define constant to compile
  272. "push %%rcx\n\t" // under 64-bit Linux
  273. "push %%rax\n\t"
  274. #else
  275. "pushl %%ecx\n\t"
  276. "pushl %%eax\n\t"
  277. #endif
  278. // "movl $count, %%eax\n\t" //done by Assembler below
  279. "xorl %%ecx, %%ecx"
  280. // "movl %%ecx, MaskWidth\n\t" //done by Assembler below
  281. : "=c" (MaskWidth)
  282. : "a" (count)
  283. // : "%ecx", "%eax" We don't list these as clobbered because we don't want the assembler
  284. //to put them back when we are done
  285. );
  286. asm
  287. (
  288. "decl %%eax\n\t"
  289. "bsrw %%ax,%%cx\n\t"
  290. "jz next\n\t"
  291. "incw %%cx\n\t"
  292. // "movl %%ecx, MaskWidth\n" //done by Assembler below
  293. : "=c" (MaskWidth)
  294. :
  295. );
  296. asm
  297. (
  298. "next:\n\t"
  299. #ifdef __x86_64__
  300. "pop %rax\n\t"
  301. "pop %rcx"
  302. #else
  303. "popl %eax\n\t"
  304. "popl %ecx"
  305. #endif
  306. );
  307. #elif defined( TORQUE_COMPILER_VISUALC )
  308. __asm
  309. {
  310. mov eax, count
  311. mov ecx, 0
  312. mov MaskWidth, ecx
  313. dec eax
  314. bsr cx, ax
  315. jz next
  316. inc cx
  317. mov MaskWidth, ecx
  318. next:
  319. }
  320. #else
  321. # error Not implemented.
  322. #endif
  323. return MaskWidth;
  324. }
  325. //
  326. // Extract the subset of bit field from the 8-bit value FullID. It returns the 8-bit sub ID value
  327. //
  328. static U8 GetNzbSubID(U8 FullID,
  329. U8 MaxSubIDValue,
  330. U8 ShiftCount)
  331. {
  332. U32 MaskWidth;
  333. U8 MaskBits;
  334. MaskWidth = find_maskwidth((U32) MaxSubIDValue);
  335. MaskBits = (0xff << ShiftCount) ^
  336. ((U8) (0xff << (ShiftCount + MaskWidth)));
  337. return (FullID & MaskBits);
  338. }
  339. #endif
  340. //
  341. //
  342. //
  343. EConfig CPUCount(U32& TotAvailLogical, U32& TotAvailCore, U32& PhysicalNum)
  344. {
  345. EConfig StatusFlag = CONFIG_UserConfigIssue;
  346. g_s3Levels[0] = 0;
  347. TotAvailCore = 1;
  348. PhysicalNum = 1;
  349. U32 numLPEnabled = 0;
  350. S32 MaxLPPerCore = 1;
  351. #ifdef TORQUE_OS_MAC
  352. //FIXME: This isn't a proper port but more or less just some sneaky cheating
  353. // to get around having to mess with yet another crap UNIX-style API. Seems
  354. // like there isn't a way to do this that's working across all OSX incarnations
  355. // and machine configurations anyway.
  356. S32 numCPUs;
  357. S32 numPackages;
  358. // Get the number of CPUs.
  359. size_t len = sizeof( numCPUs );
  360. if( sysctlbyname( "hw.ncpu", &numCPUs, &len, 0, 0 ) == -1 )
  361. return CONFIG_UserConfigIssue;
  362. // Get the number of packages.
  363. len = sizeof( numPackages );
  364. if( sysctlbyname( "hw.packages", &numPackages, &len, 0, 0 ) == -1 )
  365. return CONFIG_UserConfigIssue;
  366. TotAvailCore = numCPUs;
  367. TotAvailLogical = numCPUs;
  368. PhysicalNum = numPackages;
  369. #else
  370. U32 dwAffinityMask;
  371. S32 j = 0;
  372. U8 apicID, PackageIDMask;
  373. U8 tblPkgID[256], tblCoreID[256], tblSMTID[256];
  374. char tmp[256];
  375. #ifdef TORQUE_OS_LINUX
  376. //we need to make sure that this process is allowed to run on
  377. //all of the logical processors that the OS itself can run on.
  378. //A process could acquire/inherit affinity settings that restricts the
  379. // current process to run on a subset of all logical processor visible to OS.
  380. // Linux doesn't easily allow us to look at the Affinity Bitmask directly,
  381. // but it does provide an API to test affinity maskbits of the current process
  382. // against each logical processor visible under OS.
  383. S32 sysNumProcs = sysconf(_SC_NPROCESSORS_CONF); //This will tell us how many
  384. //CPUs are currently enabled.
  385. //this will tell us which processors this process can run on.
  386. cpu_set_t allowedCPUs;
  387. sched_getaffinity(0, sizeof(allowedCPUs), &allowedCPUs);
  388. for (S32 i = 0; i < sysNumProcs; i++ )
  389. {
  390. if ( CPU_ISSET(i, &allowedCPUs) == 0 )
  391. return CONFIG_UserConfigIssue;
  392. }
  393. #elif defined( TORQUE_OS_WIN )
  394. DWORD dwProcessAffinity, dwSystemAffinity;
  395. GetProcessAffinityMask(GetCurrentProcess(),
  396. &dwProcessAffinity,
  397. &dwSystemAffinity);
  398. if (dwProcessAffinity != dwSystemAffinity) // not all CPUs are enabled
  399. return CONFIG_UserConfigIssue;
  400. #else
  401. # error Not implemented.
  402. #endif
  403. // Assume that cores within a package have the SAME number of
  404. // logical processors. Also, values returned by
  405. // MaxLogicalProcPerPhysicalProc and MaxCorePerPhysicalProc do not have
  406. // to be power of 2.
  407. MaxLPPerCore = MaxLogicalProcPerPhysicalProc() / MaxCorePerPhysicalProc();
  408. dwAffinityMask = 1;
  409. #ifdef TORQUE_OS_LINUX
  410. cpu_set_t currentCPU;
  411. while ( j < sysNumProcs )
  412. {
  413. CPU_ZERO(&currentCPU);
  414. CPU_SET(j, &currentCPU);
  415. if ( sched_setaffinity (0, sizeof(currentCPU), &currentCPU) == 0 )
  416. {
  417. sleep(0); // Ensure system to switch to the right CPU
  418. #elif defined( TORQUE_OS_WIN )
  419. while (dwAffinityMask && dwAffinityMask <= dwSystemAffinity)
  420. {
  421. if (SetThreadAffinityMask(GetCurrentThread(), dwAffinityMask))
  422. {
  423. Sleep(0); // Ensure system to switch to the right CPU
  424. #else
  425. # error Not implemented.
  426. #endif
  427. apicID = GetAPIC_ID();
  428. // Store SMT ID and core ID of each logical processor
  429. // Shift vlaue for SMT ID is 0
  430. // Shift value for core ID is the mask width for maximum logical
  431. // processors per core
  432. tblSMTID[j] = GetNzbSubID(apicID, MaxLPPerCore, 0);
  433. U8 maxCorePPP = MaxCorePerPhysicalProc();
  434. U8 maskWidth = find_maskwidth(MaxLPPerCore);
  435. tblCoreID[j] = GetNzbSubID(apicID, maxCorePPP, maskWidth);
  436. // Extract package ID, assume single cluster.
  437. // Shift value is the mask width for max Logical per package
  438. PackageIDMask = (unsigned char) (0xff <<
  439. find_maskwidth(MaxLogicalProcPerPhysicalProc()));
  440. tblPkgID[j] = apicID & PackageIDMask;
  441. sprintf(tmp," AffinityMask = %d; Initial APIC = %d; Physical ID = %d, Core ID = %d, SMT ID = %d\n",
  442. dwAffinityMask, apicID, tblPkgID[j], tblCoreID[j], tblSMTID[j]);
  443. dStrcat(g_s3Levels, tmp, 2048);
  444. numLPEnabled ++; // Number of available logical processors in the system.
  445. } // if
  446. j++;
  447. dwAffinityMask = 1 << j;
  448. } // while
  449. // restore the affinity setting to its original state
  450. #ifdef TORQUE_OS_LINUX
  451. sched_setaffinity (0, sizeof(allowedCPUs), &allowedCPUs);
  452. sleep(0);
  453. #elif defined( TORQUE_OS_WIN )
  454. SetThreadAffinityMask(GetCurrentThread(), dwProcessAffinity);
  455. Sleep(0);
  456. #else
  457. # error Not implemented.
  458. #endif
  459. TotAvailLogical = numLPEnabled;
  460. //
  461. // Count available cores (TotAvailCore) in the system
  462. //
  463. U8 CoreIDBucket[256];
  464. DWORD ProcessorMask, pCoreMask[256];
  465. U32 i, ProcessorNum;
  466. CoreIDBucket[0] = tblPkgID[0] | tblCoreID[0];
  467. ProcessorMask = 1;
  468. pCoreMask[0] = ProcessorMask;
  469. for (ProcessorNum = 1; ProcessorNum < numLPEnabled; ProcessorNum++)
  470. {
  471. ProcessorMask <<= 1;
  472. for (i = 0; i < TotAvailCore; i++)
  473. {
  474. // Comparing bit-fields of logical processors residing in different packages
  475. // Assuming the bit-masks are the same on all processors in the system.
  476. if ((tblPkgID[ProcessorNum] | tblCoreID[ProcessorNum]) == CoreIDBucket[i])
  477. {
  478. pCoreMask[i] |= ProcessorMask;
  479. break;
  480. }
  481. } // for i
  482. if (i == TotAvailCore) // did not match any bucket. Start a new one.
  483. {
  484. CoreIDBucket[i] = tblPkgID[ProcessorNum] | tblCoreID[ProcessorNum];
  485. pCoreMask[i] = ProcessorMask;
  486. TotAvailCore++; // Number of available cores in the system
  487. }
  488. } // for ProcessorNum
  489. //
  490. // Count physical processor (PhysicalNum) in the system
  491. //
  492. U8 PackageIDBucket[256];
  493. DWORD pPackageMask[256];
  494. PackageIDBucket[0] = tblPkgID[0];
  495. ProcessorMask = 1;
  496. pPackageMask[0] = ProcessorMask;
  497. for (ProcessorNum = 1; ProcessorNum < numLPEnabled; ProcessorNum++)
  498. {
  499. ProcessorMask <<= 1;
  500. for (i = 0; i < PhysicalNum; i++)
  501. {
  502. // Comparing bit-fields of logical processors residing in different packages
  503. // Assuming the bit-masks are the same on all processors in the system.
  504. if (tblPkgID[ProcessorNum]== PackageIDBucket[i])
  505. {
  506. pPackageMask[i] |= ProcessorMask;
  507. break;
  508. }
  509. } // for i
  510. if (i == PhysicalNum) // did not match any bucket. Start a new one.
  511. {
  512. PackageIDBucket[i] = tblPkgID[ProcessorNum];
  513. pPackageMask[i] = ProcessorMask;
  514. PhysicalNum++; // Total number of physical processors in the system
  515. }
  516. } // for ProcessorNum
  517. #endif
  518. //
  519. // Check to see if the system is multi-core
  520. // Check if the system is hyper-threading
  521. //
  522. if (TotAvailCore > PhysicalNum)
  523. {
  524. // Multi-core
  525. if (MaxLPPerCore == 1)
  526. StatusFlag = CONFIG_MultiCoreAndHTNotCapable;
  527. else if (numLPEnabled > TotAvailCore)
  528. StatusFlag = CONFIG_MultiCoreAndHTEnabled;
  529. else StatusFlag = CONFIG_MultiCoreAndHTDisabled;
  530. }
  531. else
  532. {
  533. // Single-core
  534. if (MaxLPPerCore == 1)
  535. StatusFlag = CONFIG_SingleCoreAndHTNotCapable;
  536. else if (numLPEnabled > TotAvailCore)
  537. StatusFlag = CONFIG_SingleCoreHTEnabled;
  538. else StatusFlag = CONFIG_SingleCoreHTDisabled;
  539. }
  540. return StatusFlag;
  541. }
  542. } // namespace CPUInfo
  543. #endif
  544. #endif