
/* CpuArch.c -- CPU specific code
Igor Pavlov : Public domain */

#include "Precomp.h"

// #include <stdio.h>

#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

#undef NEED_CHECK_FOR_CPUID
#if !defined(MY_CPU_AMD64)
#define NEED_CHECK_FOR_CPUID
#endif

/*
  The cpuid instruction supports a (subFunction) parameter in ECX
  that is used only with some specific (function) parameter values.
  Most functions use only (subFunction == 0).
*/
/*
  __cpuid(): MSVC and GCC/CLANG use the same function/macro name,
  but the parameters are different.
  We use the MSVC __cpuid() parameter style for our z7_x86_cpuid() function.
*/
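/*
  Illustration (not part of the build) of the two parameter styles.
  MSVC <intrin.h> fills an array; the GCC/CLANG <cpuid.h> macro fills
  four separate lvalues:

    int info[4];
    __cpuid(info, 1);        // MSVC style: (array, function)

    unsigned a, b, c, d;
    __cpuid(1, a, b, c, d);  // <cpuid.h> style: (function, a, b, c, d)

  z7_x86_cpuid(p, func) below follows the MSVC convention:
  p[0..3] receive EAX, EBX, ECX, EDX.
*/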
#if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \
    || defined(__clang__) /* && (__clang_major__ >= 10) */

/*
  There were some CLANG/GCC compilers that had issues with
  rbx (ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined).
  The compiler's <cpuid.h> contains a __cpuid() macro that is similar to our code.
  The history of __cpuid() changes in CLANG/GCC:
  GCC:
    2007: it preserved ebx for (__PIC__ && __i386__)
    2013: it preserved rbx and ebx for __PIC__
    2014: it doesn't preserve rbx and ebx anymore
    We suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem.
  CLANG:
    2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check.
    Why does CLANG care only about 64-bit mode, and not about ebx (in 32-bit)?
    Do we need a __PIC__ test for CLANG, or must we preserve rbx even if
    __PIC__ is not defined?
*/

#define ASM_LN "\n"

#if defined(MY_CPU_AMD64) && defined(__PIC__) \
    && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))

/* "=&r" selects a free register. It can select even rbx, if that register is free.
   "=&D" (RDI) also works, but the code can be larger with "=&D".
   "2"(subFunc) : 2 is the (zero-based) index in the output constraint list: "=c" (ECX). */

#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "mov     %%rbx, %q1"  \
    ASM_LN   "cpuid"               \
    ASM_LN   "xchg    %%rbx, %q1"  \
    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }

#elif defined(MY_CPU_X86) && defined(__PIC__) \
    && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))

#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "mov     %%ebx, %k1"  \
    ASM_LN   "cpuid"               \
    ASM_LN   "xchg    %%ebx, %k1"  \
    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }

#else

#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "cpuid"               \
    : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }

#endif
#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0)

void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  x86_cpuid_MACRO(p, func)
}
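/*
  Usage sketch (illustrative only, needs <string.h>): function 0 returns
  the maximum supported function number in EAX and the 12-byte vendor
  string in EBX, EDX, ECX (in that order):

    UInt32 p[4];
    char vendor[13];
    z7_x86_cpuid(p, 0);
    memcpy(vendor    , &p[1], 4);  // EBX: "Genu"
    memcpy(vendor + 4, &p[3], 4);  // EDX: "ineI"
    memcpy(vendor + 8, &p[2], 4);  // ECX: "ntel"
    vendor[12] = 0;
*/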
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  x86_cpuid_MACRO_2(p, func, subFunc)
}

Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
#if defined(NEED_CHECK_FOR_CPUID)
  #define EFLAGS_CPUID_BIT 21
  UInt32 a;
  __asm__ __volatile__ (
    ASM_LN   "pushf"
    ASM_LN   "pushf"
    ASM_LN   "pop     %0"
    // ASM_LN   "movl    %0, %1"
    // ASM_LN   "xorl    $0x200000, %0"
    ASM_LN   "btc     %1, %0"
    ASM_LN   "push    %0"
    ASM_LN   "popf"
    ASM_LN   "pushf"
    ASM_LN   "pop     %0"
    ASM_LN   "xorl    (%%esp), %0"
    ASM_LN   "popf"
    ASM_LN
    : "=&r" (a) // "=a"
    : "i" (EFLAGS_CPUID_BIT)
  );
  if ((a & (1 << EFLAGS_CPUID_BIT)) == 0)
    return 0;
#endif
  {
    UInt32 p[4];
    x86_cpuid_MACRO(p, 0)
    return p[0];
  }
}

#undef ASM_LN
#elif !defined(_MSC_VER)

/*
// for gcc/clang and others: we can try to use the __cpuid macro:
#include <cpuid.h>
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  __cpuid(func, p[0], p[1], p[2], p[3]);
}
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  return (UInt32)__get_cpuid_max(0, NULL);
}
*/

// for unsupported cpuid:
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  UNUSED_VAR(func)
  p[0] = p[1] = p[2] = p[3] = 0;
}

UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  return 0;
}
#else // _MSC_VER

#if !defined(MY_CPU_AMD64)

UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
#if defined(NEED_CHECK_FOR_CPUID)
  #define EFLAGS_CPUID_BIT 21
  __asm   pushfd
  __asm   pushfd
  /*
  __asm   pop     eax
  // __asm   mov     edx, eax
  __asm   btc     eax, EFLAGS_CPUID_BIT
  __asm   push    eax
  */
  __asm   btc     dword ptr [esp], EFLAGS_CPUID_BIT
  __asm   popfd
  __asm   pushfd
  __asm   pop     eax
  // __asm   xor     eax, edx
  __asm   xor     eax, [esp]
  // __asm   push    edx
  __asm   popfd
  __asm   and     eax, (1 shl EFLAGS_CPUID_BIT)
  __asm   jz      end_func
#endif
  __asm   push    ebx
  __asm   xor     eax, eax    // func
  __asm   xor     ecx, ecx    // subFunction (optional) for (func == 0)
  __asm   cpuid
  __asm   pop     ebx
#if defined(NEED_CHECK_FOR_CPUID)
  end_func:
#endif
  __asm   ret 0
}

void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  UNUSED_VAR(p)
  UNUSED_VAR(func)
  __asm   push    ebx
  __asm   push    edi
  __asm   mov     edi, ecx    // p
  __asm   mov     eax, edx    // func
  __asm   xor     ecx, ecx    // subFunction (optional) for (func == 0)
  __asm   cpuid
  __asm   mov     [edi     ], eax
  __asm   mov     [edi +  4], ebx
  __asm   mov     [edi +  8], ecx
  __asm   mov     [edi + 12], edx
  __asm   pop     edi
  __asm   pop     ebx
  __asm   ret 0
}

static
void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  UNUSED_VAR(p)
  UNUSED_VAR(func)
  UNUSED_VAR(subFunc)
  __asm   push    ebx
  __asm   push    edi
  __asm   mov     edi, ecx          // p
  __asm   mov     eax, edx          // func
  __asm   mov     ecx, [esp + 12]   // subFunc
  __asm   cpuid
  __asm   mov     [edi     ], eax
  __asm   mov     [edi +  4], ebx
  __asm   mov     [edi +  8], ecx
  __asm   mov     [edi + 12], edx
  __asm   pop     edi
  __asm   pop     ebx
  __asm   ret 4
}
#else // MY_CPU_AMD64

#if _MSC_VER >= 1600
  #include <intrin.h>
  #define MY_cpuidex  __cpuidex

static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  __cpuidex((int *)p, func, subFunc);
}

#else
/*
  __cpuid (func == (0 or 7)) requires the subfunction number in ECX.
  MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction.
  __cpuid() in new MSVC clears ECX.
  __cpuid() in old MSVC (14.00) x64 doesn't clear ECX.
  We still can use __cpuid for low (func) values that don't require ECX,
  but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
  So here we use a hack for old MSVC to send the (subFunction) value to the cpuid
  instruction in the ECX register: ECX holds the first parameter of a
  FASTCALL / NO_INLINE function. So the caller of MY_cpuidex_HACK() sets ECX
  to subFunction, old MSVC's __cpuid() doesn't change ECX, and the cpuid
  instruction gets the (subFunction) value.
  DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!!
*/
static
Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo)
{
  UNUSED_VAR(subFunction)
  __cpuid(CPUInfo, func);
}
  #define MY_cpuidex(info, func, func2)  MY_cpuidex_HACK(func2, func, info)
  #pragma message("======== MY_cpuidex_HACK WAS USED ========")

static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
}
#endif // _MSC_VER >= 1600

#if !defined(MY_CPU_AMD64)
/* inlining for __cpuid() in MSVC x86 (32-bit) produces big inefficient code,
   so we disable inlining here */
Z7_NO_INLINE
#endif
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  MY_cpuidex((Int32 *)p, (Int32)func, 0);
}

Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  Int32 a[4];
  MY_cpuidex(a, 0, 0);
  return a[0];
}

#endif // MY_CPU_AMD64
#endif // _MSC_VER
#if defined(NEED_CHECK_FOR_CPUID)
#define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; }
#else
#define CHECK_CPUID_IS_SUPPORTED
#endif
#undef NEED_CHECK_FOR_CPUID

static
BoolInt x86cpuid_Func_1(UInt32 *p)
{
  CHECK_CPUID_IS_SUPPORTED
  z7_x86_cpuid(p, 1);
  return True;
}
/*
static const UInt32 kVendors[][1] =
{
  { 0x756E6547 }, // , 0x49656E69, 0x6C65746E },
  { 0x68747541 }, // , 0x69746E65, 0x444D4163 },
  { 0x746E6543 }  // , 0x48727561, 0x736C7561 }
};
*/

/*
typedef struct
{
  UInt32 maxFunc;
  UInt32 vendor[3];
  UInt32 ver;
  UInt32 b;
  UInt32 c;
  UInt32 d;
} Cx86cpuid;

enum
{
  CPU_FIRM_INTEL,
  CPU_FIRM_AMD,
  CPU_FIRM_VIA
};

int x86cpuid_GetFirm(const Cx86cpuid *p);

#define x86cpuid_ver_GetFamily(ver)   (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf))
#define x86cpuid_ver_GetModel(ver)    (((ver >> 12) & 0xf0) | ((ver >> 4) & 0xf))
#define x86cpuid_ver_GetStepping(ver) (ver & 0xf)

int x86cpuid_GetFirm(const Cx86cpuid *p)
{
  unsigned i;
  for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++)
  {
    const UInt32 *v = kVendors[i];
    if (v[0] == p->vendor[0]
        // && v[1] == p->vendor[1]
        // && v[2] == p->vendor[2]
        )
      return (int)i;
  }
  return -1;
}

BoolInt CPU_Is_InOrder()
{
  Cx86cpuid p;
  UInt32 family, model;
  if (!x86cpuid_CheckAndRead(&p))
    return True;
  family = x86cpuid_ver_GetFamily(p.ver);
  model = x86cpuid_ver_GetModel(p.ver);
  switch (x86cpuid_GetFirm(&p))
  {
    case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && (
        // In-Order Atom CPU
           model == 0x1C  // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330
        || model == 0x26  // 45 nm, Z6xx
        || model == 0x27  // 32 nm, Z2460
        || model == 0x35  // 32 nm, Z2760
        || model == 0x36  // 32 nm, N2xxx, D2xxx
        )));
    case CPU_FIRM_AMD: return (family < 5 || (family == 5 && (model < 6 || model == 0xA)));
    case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF));
  }
  return False; // v23 : unknown processors are not In-Order
}
*/
#ifdef _WIN32
#include "7zWindows.h"
#endif

#if !defined(MY_CPU_AMD64) && defined(_WIN32)

/* for legacy SSE ia32: there is no user-space cpu instruction to check
   that the OS supports SSE register storing/restoring on context switches.
   So we need some OS-specific function to check that it's safe to use SSE registers. */

Z7_FORCE_INLINE
static BoolInt CPU_Sys_Is_SSE_Supported(void)
{
#ifdef _MSC_VER
  #pragma warning(push)
  #pragma warning(disable : 4996) // `GetVersion': was declared deprecated
#endif
  /* The low byte is the major version of Windows.
     We suppose that any Windows version since
     Windows 2000 (major == 5) supports SSE registers. */
  return (Byte)GetVersion() >= 5;
#if defined(_MSC_VER)
  #pragma warning(pop)
#endif
}
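/*
  Sketch (illustrative only) of the GetVersion() encoding this relies on
  (low byte = major version, next byte = minor version),
  for example on Windows 7 (version 6.1):

    const DWORD v = GetVersion();
    const unsigned major = (unsigned)(Byte)v;         // 6
    const unsigned minor = (unsigned)(Byte)(v >> 8);  // 1
*/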
#define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False;
#else
#define CHECK_SYS_SSE_SUPPORT
#endif

#if !defined(MY_CPU_AMD64)

BoolInt CPU_IsSupported_CMOV(void)
{
  UInt32 a[4];
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (BoolInt)(a[3] >> 15) & 1;
}

BoolInt CPU_IsSupported_SSE(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (BoolInt)(a[3] >> 25) & 1;
}

BoolInt CPU_IsSupported_SSE2(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (BoolInt)(a[3] >> 26) & 1;
}

#endif

static UInt32 x86cpuid_Func_1_ECX(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return a[2];
}
BoolInt CPU_IsSupported_AES(void)
{
  return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1;
}

BoolInt CPU_IsSupported_SSSE3(void)
{
  return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1;
}

BoolInt CPU_IsSupported_SSE41(void)
{
  return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1;
}

BoolInt CPU_IsSupported_SHA(void)
{
  CHECK_SYS_SSE_SUPPORT
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    return (BoolInt)(d[1] >> 29) & 1;
  }
}

BoolInt CPU_IsSupported_SHA512(void)
{
  if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid_subFunc(d, 7, 0);
    if (d[0] < 1) // d[0] is the max supported subleaf value
      return False;
    z7_x86_cpuid_subFunc(d, 7, 1);
    return (BoolInt)(d[0]) & 1;
  }
}
/*
  MSVC: the _xgetbv() intrinsic is available since VS2010SP1.
    MSVC also defines the (_XCR_XFEATURE_ENABLED_MASK) macro in
    <immintrin.h> that we can use or check.
    For any 32-bit x86 we can use asm code in MSVC,
    but MSVC asm code is huge after compilation,
    so _xgetbv() is better.

  ICC: the _xgetbv() intrinsic is available (since what version of ICC?).
    ICC defines (__GNUC__) and supports the gnu assembler;
    also ICC supports MASM-style code with the -use-msasm switch.
    But ICC doesn't support __attribute__((__target__)).

  GCC/CLANG 9:
    _xgetbv() is a macro that works via __builtin_ia32_xgetbv()
    and requires __attribute__((__target__("xsave"))).
    But with __target__("xsave") the function will not be
    inlined into a function that has no __target__("xsave") attribute.
    If we want _xgetbv() call inlining, then we should use the asm version
    instead of calling _xgetbv().
    Note: the intrinsic is broken before GCC 8.2:
      https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684
*/
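/*
  For reference, the intrinsic variant for GCC/CLANG (>= 9) would look like
  the sketch below; we prefer the asm version, because the "xsave" target
  attribute blocks inlining into functions compiled without that attribute:

    #include <immintrin.h>
    __attribute__((__target__("xsave")))
    static UInt64 xgetbv_via_intrinsic(UInt32 num)
    {
      return _xgetbv(num);
    }
*/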
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \
    || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219) \
    || defined(__GNUC__) && (__GNUC__ >= 9) \
    || defined(__clang__) && (__clang_major__ >= 9)
// we define ATTRIB_XGETBV, if we want to use the predefined _xgetbv() from the compiler
#if defined(__INTEL_COMPILER)
#define ATTRIB_XGETBV
#elif defined(__GNUC__) || defined(__clang__)
// we don't define ATTRIB_XGETBV here, because the asm version is better for inlining.
// #define ATTRIB_XGETBV __attribute__((__target__("xsave")))
#else
#define ATTRIB_XGETBV
#endif
#endif

#if defined(ATTRIB_XGETBV)
#include <immintrin.h>
#endif

// XFEATURE_ENABLED_MASK/XCR0
#define MY_XCR_XFEATURE_ENABLED_MASK 0

#if defined(ATTRIB_XGETBV)
ATTRIB_XGETBV
#endif
static UInt64 x86_xgetbv_0(UInt32 num)
{
#if defined(ATTRIB_XGETBV)
  {
    return
      #if (defined(_MSC_VER))
        _xgetbv(num);
      #else
        __builtin_ia32_xgetbv(
          #if !defined(__clang__)
            (int)
          #endif
            num);
      #endif
  }

#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC)

  UInt32 a, d;
 #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
  __asm__
  (
    "xgetbv"
    : "=a"(a), "=d"(d) : "c"(num) : "cc"
  );
 #else // is old gcc
  __asm__
  (
    ".byte 0x0f, 0x01, 0xd0" "\n\t"
    : "=a"(a), "=d"(d) : "c"(num) : "cc"
  );
 #endif
  return ((UInt64)d << 32) | a;
  // return a;

#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64)

  UInt32 a, d;
  __asm {
    push eax
    push edx
    push ecx
    mov ecx, num
    // xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK
    _emit 0x0f
    _emit 0x01
    _emit 0xd0
    mov a, eax
    mov d, edx
    pop ecx
    pop edx
    pop eax
  }
  return ((UInt64)d << 32) | a;
  // return a;

#else // unknown compiler

  // #error "Need xgetbv function"
  UNUSED_VAR(num)
  // for MSVC-X64 we could call an external function from an external file.
  /* Actually we have checked OSXSAVE/AVX in cpuid before.
     So it's expected that the OS supports at least AVX and below. */
  // if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0
  return
    //  (1 << 0) |  // x87
        (1 << 1)    // SSE
      | (1 << 2);   // AVX

#endif
}
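/*
  Usage sketch (illustrative only): decoding the XCR0 bits returned by
  x86_xgetbv_0() for the states the OS saves/restores:

    const UInt64 xcr0 = x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
    const BoolInt sseState = (BoolInt)(xcr0 >> 1) & 1;  // XMM state
    const BoolInt avxState = (BoolInt)(xcr0 >> 2) & 1;  // YMM upper halves
*/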
#ifdef _WIN32
/*
  Windows versions do not know about new ISA extensions that
  can be introduced later. We still can use new extensions,
  even if Windows doesn't report support for them, but only if
  Windows knows about the ISA extensions that change the number or
  size of registers: SSE, AVX/XSAVE, AVX512.
  So it's enough to check
    MY_PF_AVX_INSTRUCTIONS_AVAILABLE
      instead of
    MY_PF_AVX2_INSTRUCTIONS_AVAILABLE
*/
#define MY_PF_XSAVE_ENABLED                     17
// #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE   36
// #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE  37
// #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE  38
// #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE     39
// #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE    40
// #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE 41
#endif
BoolInt CPU_IsSupported_AVX(void)
{
#ifdef _WIN32
  if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED))
    return False;
  /* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from
     some of the latest Win10 revisions. But we need AVX in older Windows also.
     So we don't use the following check: */
  /*
  if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE))
    return False;
  */
#endif

  /*
    The OS must use the new special XSAVE/XRSTOR instructions to save
    AVX registers when it is required for context switching.
    At OS startup:
      the OS sets the CR4.OSXSAVE flag to signal the processor that the OS supports the XSAVE extensions;
      also the OS sets the bitmask in the XCR0 register that defines what
      register states will be processed by the XSAVE instruction:
        XCR0.X87[bit 0] - x87 registers and state
        XCR0.SSE[bit 1] - SSE registers and state
        XCR0.AVX[bit 2] - AVX registers and state
    CR4.OSXSAVE is reflected to CPUID.1:ECX.OSXSAVE[bit 27],
    so we can read that bit in user-space.
    XCR0 is available for reading in user-space by the new XGETBV instruction.
  */
  {
    const UInt32 c = x86cpuid_Func_1_ECX();
    if (0 == (1
        & (c >> 28)   // AVX instructions are supported by hardware
        & (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by the OS
      return False;
  }

  /* also we can check
     CPUID.1:ECX.XSAVE [bit 26] : it shows that the
     XSAVE, XRSTOR, XSETBV, XGETBV instructions are supported by hardware.
     But that check is redundant, because if the OSXSAVE bit is set, then XSAVE is also set. */

  /* If the OS has enabled the XSAVE extension instructions (OSXSAVE == 1),
     in most cases we expect that the OS also supports storing/restoring
     for AVX and SSE states at least.
     But to be sure, we call the user-space instruction
     XGETBV(0) to get the XCR0 value that contains the bitmask defining
     which exact states (registers) the OS has enabled for storing/restoring. */
  {
    const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
    // printf("\n=== XGetBV=0x%x\n", bm);
    return 1
        & (BoolInt)(bm >> 1)  // SSE state is supported (set by OS) for storing/restoring
        & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring
  }
  // since Win7SP1: we can use GetEnabledXStateFeatures();
}
BoolInt CPU_IsSupported_AVX2(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    return 1
      & (BoolInt)(d[1] >> 5); // avx2
  }
}
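/*
  Typical caller-side dispatch sketch (the function names here are
  hypothetical, not part of this file):

    if (CPU_IsSupported_AVX2())
      g_Crc64_Update = Crc64_Update_AVX2;     // hypothetical AVX2 code path
    else
      g_Crc64_Update = Crc64_Update_Generic;  // hypothetical portable path
*/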
#if 0
BoolInt CPU_IsSupported_AVX512F_AVX512VL(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    BoolInt v;
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    v = 1
      & (BoolInt)(d[1] >> 16)  // avx512f
      & (BoolInt)(d[1] >> 31); // avx512vl
    if (!v)
      return False;
  }
  {
    const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
    // printf("\n=== XGetBV=0x%x\n", bm);
    return 1
      & (BoolInt)(bm >> 5)  // OPMASK
      & (BoolInt)(bm >> 6)  // ZMM upper 256 bits
      & (BoolInt)(bm >> 7); // ZMM16 ... ZMM31
  }
}
#endif

BoolInt CPU_IsSupported_VAES_AVX2(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    return 1
      & (BoolInt)(d[1] >> 5)  // avx2
      // & (d[1] >> 31)       // avx512vl
      & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX
  }
}

BoolInt CPU_IsSupported_PageGB(void)
{
  CHECK_CPUID_IS_SUPPORTED
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 0x80000000);
    if (d[0] < 0x80000001)
      return False;
    z7_x86_cpuid(d, 0x80000001);
    return (BoolInt)(d[3] >> 26) & 1;
  }
}
#elif defined(MY_CPU_ARM_OR_ARM64)

#ifdef _WIN32

#include "7zWindows.h"

BoolInt CPU_IsSupported_CRC32(void)  { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
BoolInt CPU_IsSupported_NEON(void)   { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }

#else

#if defined(__APPLE__)

/*
#include <stdio.h>
#include <string.h>
static void Print_sysctlbyname(const char *name)
{
  size_t bufSize = 256;
  char buf[256];
  int res = sysctlbyname(name, &buf, &bufSize, NULL, 0);
  {
    int i;
    printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize);
    for (i = 0; i < 20; i++)
      printf(" %2x", (unsigned)(Byte)buf[i]);
  }
}
*/
/*
  Print_sysctlbyname("hw.pagesize");
  Print_sysctlbyname("machdep.cpu.brand_string");
*/

static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name)
{
  UInt32 val = 0;
  if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)
    return 1;
  return 0;
}

BoolInt CPU_IsSupported_CRC32(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
}

BoolInt CPU_IsSupported_NEON(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
}

BoolInt CPU_IsSupported_SHA512(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
}

/*
BoolInt CPU_IsSupported_SHA3(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
}
*/

#ifdef MY_CPU_ARM64
#define APPLE_CRYPTO_SUPPORT_VAL 1
#else
#define APPLE_CRYPTO_SUPPORT_VAL 0
#endif

BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; }
#else // __APPLE__

#if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216)
  #define Z7_GETAUXV_AVAILABLE
#else
// #pragma message("=== is not NEW GLIBC === ")
  #if defined __has_include
  #if __has_include (<sys/auxv.h>)
// #pragma message("=== sys/auxv.h is avail=== ")
    #define Z7_GETAUXV_AVAILABLE
  #endif
  #endif
#endif

#ifdef Z7_GETAUXV_AVAILABLE
// #pragma message("=== Z7_GETAUXV_AVAILABLE === ")
#include <sys/auxv.h>
#define USE_HWCAP
#endif

#ifdef USE_HWCAP

#if defined(__FreeBSD__)
static unsigned long MY_getauxval(int aux)
{
  unsigned long val;
  if (elf_aux_info(aux, &val, sizeof(val)))
    return 0;
  return val;
}
#else
#define MY_getauxval  getauxval
  #if defined __has_include
  #if __has_include (<asm/hwcap.h>)
#include <asm/hwcap.h>
  #endif
  #endif
#endif

#define MY_HWCAP_CHECK_FUNC_2(name1, name2) \
  BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); }
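/*
  For example, MY_HWCAP_CHECK_FUNC_2(AES, AES) expands to:

    BoolInt CPU_IsSupported_AES(void)
      { return (MY_getauxval(AT_HWCAP) & (HWCAP_AES)); }
*/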
#ifdef MY_CPU_ARM64
#define MY_HWCAP_CHECK_FUNC(name) \
  MY_HWCAP_CHECK_FUNC_2(name, name)
#if 1 || defined(__ARM_NEON)
BoolInt CPU_IsSupported_NEON(void) { return True; }
#else
MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD)
#endif
// MY_HWCAP_CHECK_FUNC (ASIMD)
#elif defined(MY_CPU_ARM)
#define MY_HWCAP_CHECK_FUNC(name) \
  BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); }
MY_HWCAP_CHECK_FUNC_2(NEON, NEON)
#endif
#else // USE_HWCAP

#define MY_HWCAP_CHECK_FUNC(name) \
  BoolInt CPU_IsSupported_ ## name(void) { return 0; }
#if defined(__ARM_NEON)
BoolInt CPU_IsSupported_NEON(void) { return True; }
#else
MY_HWCAP_CHECK_FUNC(NEON)
#endif

#endif // USE_HWCAP

MY_HWCAP_CHECK_FUNC (CRC32)
MY_HWCAP_CHECK_FUNC (SHA1)
MY_HWCAP_CHECK_FUNC (SHA2)
MY_HWCAP_CHECK_FUNC (AES)

#ifdef MY_CPU_ARM64
// <hwcap.h> has supported HWCAP_SHA512 and HWCAP_SHA3 since 2017.
// We define them here, if they are not defined.
#ifndef HWCAP_SHA3
// #define HWCAP_SHA3 (1 << 17)
#endif
#ifndef HWCAP_SHA512
// #pragma message("=== HWCAP_SHA512 define === ")
#define HWCAP_SHA512 (1 << 21)
#endif
MY_HWCAP_CHECK_FUNC (SHA512)
// MY_HWCAP_CHECK_FUNC (SHA3)
#endif

#endif // __APPLE__
#endif // _WIN32

#endif // MY_CPU_ARM_OR_ARM64
#ifdef __APPLE__

#include <sys/sysctl.h>

int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
{
  return sysctlbyname(name, buf, bufSize, NULL, 0);
}

int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)
{
  size_t bufSize = sizeof(*val);
  const int res = z7_sysctlbyname_Get(name, val, &bufSize);
  if (res == 0 && bufSize != sizeof(*val))
    return EFAULT;
  return res;
}
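/*
  Usage sketch (illustrative only): reading a numeric sysctl value,
  for example the NEON flag queried above:

    UInt32 val = 0;
    if (z7_sysctlbyname_Get_UInt32("hw.optional.neon", &val) == 0 && val == 1)
    {
      // NEON is reported by the OS
    }
*/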
#endif