{ set.inc — i386 variable-set helper routines (extraction artifacts removed). }
{
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team

    Include file with set operations called by the compiler

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}
  11. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
  12. {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
  13. label
  14. fpc_varset_add_sets_plain_fallback;
  15. procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
  16. { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
  17. asm
  18. push %ebx
  19. fpc_varset_add_sets_plain_fallback:
  20. push %esi
  21. mov 12(%esp), %esi { esi = size }
  22. sub $4, %esi
  23. jl .LBytewise_Prepare { probably dead branch... }
  24. .L4x_Loop:
  25. mov (%eax,%esi), %ebx
  26. or (%edx,%esi), %ebx
  27. mov %ebx, (%ecx,%esi)
  28. sub $4, %esi
  29. ja .L4x_Loop
  30. mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
  31. or (%edx), %ebx
  32. mov %ebx, (%ecx)
  33. pop %esi
  34. pop %ebx
  35. ret $4
  36. .LBytewise_Prepare:
  37. add $3, %esi
  38. .LBytewise_Loop:
  39. movzbl (%eax,%esi), %ebx
  40. or (%edx,%esi), %bl
  41. mov %bl, (%ecx,%esi)
  42. sub $1, %esi
  43. jae .LBytewise_Loop
  44. pop %esi
  45. pop %ebx
  46. end;
  47. procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_add_sets {$else} fpc_varset_add_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
  48. { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
  49. asm
  50. push %ebx
  51. mov 8(%esp), %ebx
  52. sub $16, %ebx { ebx = position }
  53. jl fpc_varset_add_sets_plain_fallback { probably dead branch... }
  54. .L16x_Loop:
  55. movups (%eax,%ebx), %xmm0
  56. movups (%edx,%ebx), %xmm1
  57. orps %xmm1, %xmm0
  58. movups %xmm0, (%ecx,%ebx)
  59. sub $16, %ebx
  60. ja .L16x_Loop
  61. movups (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
  62. movups (%edx), %xmm1
  63. orps %xmm1, %xmm0
  64. movups %xmm0, (%ecx)
  65. pop %ebx
  66. end;
  67. {$ifndef CPUX86_HAS_SSEUNIT}
  68. procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
  69. var
  70. fpc_varset_add_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_add_sets_dispatch;
  71. procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint);
  72. begin
  73. if has_sse_support then
  74. fpc_varset_add_sets_impl:=@fpc_varset_add_sets_sse
  75. else
  76. fpc_varset_add_sets_impl:=@fpc_varset_add_sets_plain;
  77. fpc_varset_add_sets_impl(set1,set2,dest,size);
  78. end;
  79. procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
  80. begin
  81. fpc_varset_add_sets_impl(set1,set2,dest,size);
  82. end;
  83. {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_add_sets dispatcher)}
  84. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
  85. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
  86. {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
  87. label
  88. fpc_varset_mul_sets_plain_fallback;
  89. procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
  90. { Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
  91. asm
  92. push %ebx
  93. fpc_varset_mul_sets_plain_fallback:
  94. push %esi
  95. mov 12(%esp), %esi { esi = size }
  96. sub $4, %esi
  97. jl .LBytewise_Prepare { probably dead branch... }
  98. .L4x_Loop:
  99. mov (%eax,%esi), %ebx
  100. and (%edx,%esi), %ebx
  101. mov %ebx, (%ecx,%esi)
  102. sub $4, %esi
  103. ja .L4x_Loop
  104. mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
  105. and (%edx), %ebx
  106. mov %ebx, (%ecx)
  107. pop %esi
  108. pop %ebx
  109. ret $4
  110. .LBytewise_Prepare:
  111. add $3, %esi
  112. .LBytewise_Loop:
  113. movzbl (%eax,%esi), %ebx
  114. and (%edx,%esi), %bl
  115. mov %bl, (%ecx,%esi)
  116. sub $1, %esi
  117. jae .LBytewise_Loop
  118. pop %esi
  119. pop %ebx
  120. end;
  121. procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_mul_sets {$else} fpc_varset_mul_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
  122. { Same as fpc_varset_add_sets_sse but with 'and' instead of 'or'. }
  123. asm
  124. push %ebx
  125. mov 8(%esp), %ebx
  126. sub $16, %ebx { ebx = position }
  127. jl fpc_varset_mul_sets_plain_fallback { probably dead branch... }
  128. .L16x_Loop:
  129. movups (%eax,%ebx), %xmm0
  130. movups (%edx,%ebx), %xmm1
  131. andps %xmm1, %xmm0
  132. movups %xmm0, (%ecx,%ebx)
  133. sub $16, %ebx
  134. ja .L16x_Loop
  135. movups (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
  136. movups (%edx), %xmm1
  137. andps %xmm1, %xmm0
  138. movups %xmm0, (%ecx)
  139. pop %ebx
  140. end;
  141. {$ifndef CPUX86_HAS_SSEUNIT}
  142. procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
  143. var
  144. fpc_varset_mul_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_mul_sets_dispatch;
  145. procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint);
  146. begin
  147. if has_sse_support then
  148. fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_sse
  149. else
  150. fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_plain;
  151. fpc_varset_mul_sets_impl(set1,set2,dest,size);
  152. end;
  153. procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
  154. begin
  155. fpc_varset_mul_sets_impl(set1,set2,dest,size);
  156. end;
  157. {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_mul_sets dispatcher)}
  158. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
  159. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
  160. {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
  161. label
  162. fpc_varset_sub_sets_plain_fallback;
  163. procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
  164. { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
  165. asm
  166. push %ebx
  167. fpc_varset_sub_sets_plain_fallback:
  168. push %esi
  169. mov 12(%esp), %esi { esi = size }
  170. sub $4, %esi
  171. jl .LBytewise_Prepare { probably dead branch... }
  172. mov (%edx), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
  173. not %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
  174. and (%eax), %ebx
  175. push %ebx
  176. .L4x_Loop:
  177. mov (%edx,%esi), %ebx
  178. not %ebx
  179. and (%eax,%esi), %ebx
  180. mov %ebx, (%ecx,%esi)
  181. sub $4, %esi
  182. ja .L4x_Loop
  183. pop %ebx
  184. mov %ebx, (%ecx) { Write precalculated tail. }
  185. pop %esi
  186. pop %ebx
  187. ret $4
  188. .LBytewise_Prepare:
  189. add $3, %esi
  190. .LBytewise_Loop:
  191. movzbl (%edx,%esi), %ebx
  192. not %ebx
  193. and (%eax,%esi), %bl
  194. mov %bl, (%ecx,%esi)
  195. sub $1, %esi
  196. jae .LBytewise_Loop
  197. pop %esi
  198. pop %ebx
  199. end;
  200. procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_sub_sets {$else} fpc_varset_sub_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
  201. { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
  202. asm
  203. push %ebx
  204. mov 8(%esp), %ebx
  205. sub $16, %ebx { ebx = position }
  206. jl fpc_varset_sub_sets_plain_fallback { probably dead branch... }
  207. movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
  208. movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
  209. andnps %xmm1, %xmm2
  210. .L16x_Loop:
  211. movups (%eax,%ebx), %xmm1
  212. movups (%edx,%ebx), %xmm0
  213. andnps %xmm1, %xmm0
  214. movups %xmm0, (%ecx,%ebx)
  215. sub $16, %ebx
  216. ja .L16x_Loop
  217. movups %xmm2, (%ecx) { Write precalculated tail. }
  218. pop %ebx
  219. end;
  220. {$ifndef CPUX86_HAS_SSEUNIT}
  221. procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
  222. var
  223. fpc_varset_sub_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_sub_sets_dispatch;
  224. procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint);
  225. begin
  226. if has_sse_support then
  227. fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_sse
  228. else
  229. fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_plain;
  230. fpc_varset_sub_sets_impl(set1,set2,dest,size);
  231. end;
  232. procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
  233. begin
  234. fpc_varset_sub_sets_impl(set1,set2,dest,size);
  235. end;
  236. {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_sub_sets dispatcher)}
  237. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
  238. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
  239. {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
  240. label
  241. fpc_varset_symdif_sets_plain_fallback;
  242. procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
  243. { Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
  244. eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
  245. asm
  246. push %ebx
  247. fpc_varset_symdif_sets_plain_fallback:
  248. push %esi
  249. mov 12(%esp), %esi { esi = size }
  250. sub $4, %esi
  251. jl .LBytewise_Prepare { probably dead branch... }
  252. mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
  253. xor (%edx), %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
  254. push %ebx
  255. .L4x_Loop:
  256. mov (%eax,%esi), %ebx
  257. xor (%edx,%esi), %ebx
  258. mov %ebx, (%ecx,%esi)
  259. sub $4, %esi
  260. ja .L4x_Loop
  261. pop %ebx
  262. mov %ebx, (%ecx) { Write precalculated tail. }
  263. pop %esi
  264. pop %ebx
  265. ret $4
  266. .LBytewise_Prepare:
  267. add $3, %esi
  268. .LBytewise_Loop:
  269. movzbl (%eax,%esi), %ebx
  270. xor (%edx,%esi), %bl
  271. mov %bl, (%ecx,%esi)
  272. sub $1, %esi
  273. jae .LBytewise_Loop
  274. pop %esi
  275. pop %ebx
  276. end;
  277. procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_symdif_sets {$else} fpc_varset_symdif_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
  278. { Same as fpc_varset_sub_sets_sse but with 'xor' instead of 'and not'.
  279. eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
  280. asm
  281. push %ebx
  282. mov 8(%esp), %ebx
  283. sub $16, %ebx { ebx = position }
  284. jl fpc_varset_symdif_sets_plain_fallback { probably dead branch... }
  285. movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
  286. movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
  287. xorps %xmm1, %xmm2
  288. .L16x_Loop:
  289. movups (%eax,%ebx), %xmm1
  290. movups (%edx,%ebx), %xmm0
  291. xorps %xmm1, %xmm0
  292. movups %xmm0, (%ecx,%ebx)
  293. sub $16, %ebx
  294. ja .L16x_Loop
  295. movups %xmm2, (%ecx) { Write precalculated tail. }
  296. pop %ebx
  297. end;
  298. {$ifndef CPUX86_HAS_SSEUNIT}
  299. procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
  300. var
  301. fpc_varset_symdif_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_symdif_sets_dispatch;
  302. procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint);
  303. begin
  304. if has_sse_support then
  305. fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_sse
  306. else
  307. fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_plain;
  308. fpc_varset_symdif_sets_impl(set1,set2,dest,size);
  309. end;
  310. procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
  311. begin
  312. fpc_varset_symdif_sets_impl(set1,set2,dest,size);
  313. end;
  314. {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_symdif_sets dispatcher)}
  315. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
  316. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
  317. {$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
  318. function fpc_varset_contains_sets_plain(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
  319. { eax = set1, edx = set2, ecx = size }
  320. asm
  321. push %ebx
  322. sub $4, %ecx
  323. jl .LBytewise_Prepare { probably dead branch... }
  324. add %ecx, %eax
  325. add %ecx, %edx
  326. neg %ecx { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4. Loop ends on size >= 0, leaving up to 4 tail bytes. }
  327. .L4x_Loop:
  328. mov (%edx,%ecx), %ebx
  329. not %ebx
  330. test %ebx, (%eax,%ecx)
  331. jnz .LNo
  332. add $4, %ecx
  333. js .L4x_Loop
  334. mov (%edx), %ebx { Tail. }
  335. not %ebx
  336. mov %eax, %ecx { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
  337. xor %eax, %eax
  338. test %ebx, (%ecx)
  339. setz %al
  340. pop %ebx
  341. ret
  342. .LNo:
  343. xor %eax, %eax
  344. pop %ebx
  345. ret
  346. .LBytewise_Prepare:
  347. add $4, %ecx
  348. neg %ecx
  349. sub %ecx, %eax
  350. sub %ecx, %edx
  351. .LBytewise_Loop:
  352. movzbl (%edx,%ecx), %ebx
  353. not %ebx
  354. test %bl, (%eax,%ecx)
  355. jnz .LNo
  356. inc %ecx
  357. jnz .LBytewise_Loop
  358. mov $1, %eax
  359. pop %ebx
  360. end;
  361. function {$ifdef CPUX86_HAS_SSE2} fpc_varset_contains_sets {$else} fpc_varset_contains_sets_sse2 {$endif} (const set1,set2;size : ptrint):boolean; assembler; nostackframe; {$ifdef CPUX86_HAS_SSE2} compilerproc; {$endif}
  362. { eax = set1, edx = set2, ecx = size }
  363. asm
  364. sub $16, %ecx
  365. jl .LFallback { probably dead branch... }
  366. { Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
  367. movdqu (%eax), %xmm1
  368. movdqu (%edx), %xmm2
  369. pandn %xmm1, %xmm2
  370. .L16x_Loop:
  371. movdqu (%eax,%ecx), %xmm1
  372. movdqu (%edx,%ecx), %xmm0
  373. pandn %xmm1, %xmm0
  374. por %xmm0, %xmm2
  375. sub $16, %ecx
  376. ja .L16x_Loop
  377. pxor %xmm0, %xmm0
  378. pcmpeqb %xmm2,%xmm0
  379. pmovmskb %xmm0, %ecx
  380. xor %eax, %eax
  381. inc %cx
  382. setz %al
  383. ret
  384. .LFallback:
  385. add $16, %ecx
  386. jmp fpc_varset_contains_sets_plain
  387. end;
  388. {$ifndef CPUX86_HAS_SSE2}
  389. function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean; forward;
  390. var
  391. fpc_varset_contains_sets_impl: function(const set1,set2;size : ptrint):boolean = @fpc_varset_contains_sets_dispatch;
  392. function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean;
  393. begin
  394. if has_sse2_support then
  395. fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_sse2
  396. else
  397. fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_plain;
  398. result:=fpc_varset_contains_sets_impl(set1,set2,size);
  399. end;
  400. function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; inline;
  401. begin
  402. result:=fpc_varset_contains_sets_impl(set1,set2,size);
  403. end;
  404. {$endif ndef CPUX86_HAS_SSE2 (need fpc_varset_contains_sets dispatcher)}
  405. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}