set.inc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 1999-2000 by the Free Pascal development team
  4. Include file with set operations called by the compiler
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
  12. {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ dest := set1 OR set2 over 'size' bytes. dest may alias set1/set2; OR is
  idempotent, so the overlapping tail store is harmless.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
  NOTE(review): assumes size >= 1 — size = 0 would enter the byte loop with a
  negative start offset; confirm the compiler never calls it that way. }
asm
        push    %ebx
        push    %esi
        mov     12(%esp), %esi          { esi = size (the two pushes moved the arg from 4(%esp)). }
        sub     $4, %esi                { esi = offset of the last (possibly partial) dword. }
        jl      .LBytewise_Prepare      { size < 4: no whole dword. Probably dead branch... }
.L4x_Loop:                              { Dwords from the high end down to (excluding) offset 0. }
        mov     (%eax,%esi), %ebx
        or      (%edx,%esi), %ebx
        mov     %ebx, (%ecx,%esi)
        sub     $4, %esi
        ja      .L4x_Loop               { Exit once esi hits 0 or borrows negative (CF set). }
        mov     (%eax), %ebx            { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
        or      (%edx), %ebx
        mov     %ebx, (%ecx)
        pop     %esi
        pop     %ebx
        ret     $4                      { Pop the single stack argument (size). }
.LBytewise_Prepare:
        add     $3, %esi                { esi = size - 1 = index of the last byte. }
.LBytewise_Loop:
        movzbl  (%eax,%esi), %ebx
        or      (%edx,%esi), %bl
        mov     %bl, (%ecx,%esi)
        sub     $1, %esi
        jae     .LBytewise_Loop         { Continue while esi >= 0 (no borrow). }
        pop     %esi
        pop     %ebx
end;                                    { Final 'ret $4' for this path is presumably compiler-emitted. }
procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_add_sets {$else} fpc_varset_add_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ SSE version of dest := set1 OR set2 over 'size' bytes, 16 bytes per step.
  Unaligned loads/stores (movups) — no alignment assumption on the sets.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size. }
asm
        push    %ebx
        mov     8(%esp), %ebx           { ebx = size (the push moved the arg from 4(%esp)). }
        sub     $16, %ebx               { ebx = position of the last (possibly partial) 16-byte chunk. }
        jl      .LFallback              { size < 16: delegate to the plain routine. Hopefully dead branch... }
.L16x_Loop:
        movups  (%eax,%ebx), %xmm0
        movups  (%edx,%ebx), %xmm1
        orps    %xmm1, %xmm0
        movups  %xmm0, (%ecx,%ebx)
        sub     $16, %ebx
        ja      .L16x_Loop              { Exit once ebx hits 0 or goes negative. }
        movups  (%eax), %xmm0           { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). Overlap with the loop is fine: OR is idempotent. }
        movups  (%edx), %xmm1
        orps    %xmm1, %xmm0
        movups  %xmm0, (%ecx)
        pop     %ebx
        ret     $4
.LFallback:
        pop     %ebx                    { Back to entry state; 'size' is still at 4(%esp) for the tail call. }
        jmp     fpc_varset_add_sets_plain
end;
  68. {$ifndef CPUX86_HAS_SSEUNIT}
{ Runtime dispatch: the procedure variable starts out pointing at the one-shot
  dispatcher, which rebinds it to the SSE or plain implementation on first call. }
procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
var
  fpc_varset_add_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_add_sets_dispatch;
  72. procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint);
  73. begin
  74. if has_sse_support then
  75. fpc_varset_add_sets_impl:=@fpc_varset_add_sets_sse
  76. else
  77. fpc_varset_add_sets_impl:=@fpc_varset_add_sets_plain;
  78. fpc_varset_add_sets_impl(set1,set2,dest,size);
  79. end;
procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
{ Compilerproc entry point: forwards through the runtime-selected implementation. }
begin
  fpc_varset_add_sets_impl(set1,set2,dest,size);
end;
  84. {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_add_sets dispatcher)}
  85. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
  86. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
  87. {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ dest := set1 AND set2 over 'size' bytes (set intersection).
  Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'; AND is
  idempotent too, so the overlapping tail store is harmless when dest aliases.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
  NOTE(review): assumes size >= 1 (see fpc_varset_add_sets_plain). }
asm
        push    %ebx
        push    %esi
        mov     12(%esp), %esi          { esi = size (the two pushes moved the arg from 4(%esp)). }
        sub     $4, %esi                { esi = offset of the last (possibly partial) dword. }
        jl      .LBytewise_Prepare      { size < 4: no whole dword. Probably dead branch... }
.L4x_Loop:                              { Dwords from the high end down to (excluding) offset 0. }
        mov     (%eax,%esi), %ebx
        and     (%edx,%esi), %ebx
        mov     %ebx, (%ecx,%esi)
        sub     $4, %esi
        ja      .L4x_Loop               { Exit once esi hits 0 or borrows negative. }
        mov     (%eax), %ebx            { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
        and     (%edx), %ebx
        mov     %ebx, (%ecx)
        pop     %esi
        pop     %ebx
        ret     $4                      { Pop the single stack argument (size). }
.LBytewise_Prepare:
        add     $3, %esi                { esi = size - 1 = index of the last byte. }
.LBytewise_Loop:
        movzbl  (%eax,%esi), %ebx
        and     (%edx,%esi), %bl
        mov     %bl, (%ecx,%esi)
        sub     $1, %esi
        jae     .LBytewise_Loop         { Continue while esi >= 0 (no borrow). }
        pop     %esi
        pop     %ebx
end;                                    { Final 'ret $4' for this path is presumably compiler-emitted. }
procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_mul_sets {$else} fpc_varset_mul_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ SSE version of dest := set1 AND set2, 16 bytes per step.
  Same as fpc_varset_add_sets_sse but with 'and' instead of 'or'.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size. }
asm
        push    %ebx
        mov     8(%esp), %ebx           { ebx = size (the push moved the arg from 4(%esp)). }
        sub     $16, %ebx               { ebx = position of the last (possibly partial) 16-byte chunk. }
        jl      .LFallback              { size < 16: delegate to the plain routine. Hopefully dead branch... }
.L16x_Loop:
        movups  (%eax,%ebx), %xmm0
        movups  (%edx,%ebx), %xmm1
        andps   %xmm1, %xmm0
        movups  %xmm0, (%ecx,%ebx)
        sub     $16, %ebx
        ja      .L16x_Loop              { Exit once ebx hits 0 or goes negative. }
        movups  (%eax), %xmm0           { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). Overlap is fine: AND is idempotent. }
        movups  (%edx), %xmm1
        andps   %xmm1, %xmm0
        movups  %xmm0, (%ecx)
        pop     %ebx
        ret     $4
.LFallback:
        pop     %ebx                    { Back to entry state; 'size' is still at 4(%esp) for the tail call. }
        jmp     fpc_varset_mul_sets_plain
end;
  143. {$ifndef CPUX86_HAS_SSEUNIT}
{ Runtime dispatch: the procedure variable starts out pointing at the one-shot
  dispatcher, which rebinds it to the SSE or plain implementation on first call. }
procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
var
  fpc_varset_mul_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_mul_sets_dispatch;
  147. procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint);
  148. begin
  149. if has_sse_support then
  150. fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_sse
  151. else
  152. fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_plain;
  153. fpc_varset_mul_sets_impl(set1,set2,dest,size);
  154. end;
procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
{ Compilerproc entry point: forwards through the runtime-selected implementation. }
begin
  fpc_varset_mul_sets_impl(set1,set2,dest,size);
end;
  159. {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_mul_sets dispatcher)}
  160. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
  161. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
  162. {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ dest := set1 AND NOT set2 over 'size' bytes (set difference).
  Unlike OR/AND, this operation is not idempotent, so when dest aliases
  set1/set2 the overlapping tail store would corrupt inputs; the tail dword
  is therefore computed BEFORE the loop and written afterwards.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
  NOTE(review): assumes size >= 1 (see fpc_varset_add_sets_plain). }
asm
        push    %ebx
        push    %esi
        mov     12(%esp), %esi          { esi = size (the two pushes moved the arg from 4(%esp)). }
        sub     $4, %esi                { esi = offset of the last (possibly partial) dword. }
        jl      .LBytewise_Prepare      { size < 4: no whole dword. Probably dead branch... }
        mov     (%edx), %ebx            { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
        not     %ebx                    { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
        and     (%eax), %ebx            { ebx = set1[0..3] and not set2[0..3]. }
        push    %ebx                    { Keep it safe while the loop may overwrite those bytes. }
.L4x_Loop:                              { Dwords from the high end down to (excluding) offset 0. }
        mov     (%edx,%esi), %ebx
        not     %ebx
        and     (%eax,%esi), %ebx
        mov     %ebx, (%ecx,%esi)
        sub     $4, %esi
        ja      .L4x_Loop               { Exit once esi hits 0 or borrows negative. }
        pop     %ebx
        mov     %ebx, (%ecx)            { Write precalculated tail. }
        pop     %esi
        pop     %ebx
        ret     $4                      { Pop the single stack argument (size). }
.LBytewise_Prepare:
        add     $3, %esi                { esi = size - 1 = index of the last byte. }
.LBytewise_Loop:                        { Byte path never overlaps itself, no precalculation needed. }
        movzbl  (%edx,%esi), %ebx
        not     %ebx
        and     (%eax,%esi), %bl
        mov     %bl, (%ecx,%esi)
        sub     $1, %esi
        jae     .LBytewise_Loop         { Continue while esi >= 0 (no borrow). }
        pop     %esi
        pop     %ebx
end;                                    { Final 'ret $4' for this path is presumably compiler-emitted. }
procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_sub_sets {$else} fpc_varset_sub_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ SSE version of dest := set1 AND NOT set2, 16 bytes per step.
  andnps computes (NOT dst) AND src, hence set2 is loaded into the
  destination operand and set1 into the source.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size. }
asm
        push    %ebx
        mov     8(%esp), %ebx           { ebx = size (the push moved the arg from 4(%esp)). }
        sub     $16, %ebx               { ebx = position of the last (possibly partial) 16-byte chunk. }
        jl      .LFallback              { size < 16: delegate to the plain routine. Hopefully dead branch... }
        movups  (%eax), %xmm1           { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
        movups  (%edx), %xmm2           { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
        andnps  %xmm1, %xmm2            { xmm2 = (not set2) and set1, held until after the loop. }
.L16x_Loop:
        movups  (%eax,%ebx), %xmm1
        movups  (%edx,%ebx), %xmm0
        andnps  %xmm1, %xmm0            { xmm0 = (not set2 chunk) and set1 chunk. }
        movups  %xmm0, (%ecx,%ebx)
        sub     $16, %ebx
        ja      .L16x_Loop              { Exit once ebx hits 0 or goes negative. }
        movups  %xmm2, (%ecx)           { Write precalculated tail. }
        pop     %ebx
        ret     $4
.LFallback:
        pop     %ebx                    { Back to entry state; 'size' is still at 4(%esp) for the tail call. }
        jmp     fpc_varset_sub_sets_plain
end;
  223. {$ifndef CPUX86_HAS_SSEUNIT}
{ Runtime dispatch: the procedure variable starts out pointing at the one-shot
  dispatcher, which rebinds it to the SSE or plain implementation on first call. }
procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
var
  fpc_varset_sub_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_sub_sets_dispatch;
  227. procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint);
  228. begin
  229. if has_sse_support then
  230. fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_sse
  231. else
  232. fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_plain;
  233. fpc_varset_sub_sets_impl(set1,set2,dest,size);
  234. end;
procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
{ Compilerproc entry point: forwards through the runtime-selected implementation. }
begin
  fpc_varset_sub_sets_impl(set1,set2,dest,size);
end;
  239. {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_sub_sets dispatcher)}
  240. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
  241. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
  242. {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ dest := set1 XOR set2 over 'size' bytes (symmetric difference).
  Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not';
  XOR is likewise not idempotent, so the tail dword is computed before the
  (possibly overlapping) loop and stored afterwards.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
  NOTE(review): assumes size >= 1 (see fpc_varset_add_sets_plain). }
asm
        push    %ebx
        push    %esi
        mov     12(%esp), %esi          { esi = size (the two pushes moved the arg from 4(%esp)). }
        sub     $4, %esi                { esi = offset of the last (possibly partial) dword. }
        jl      .LBytewise_Prepare      { size < 4: no whole dword. Probably dead branch... }
        mov     (%eax), %ebx            { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
        xor     (%edx), %ebx            { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
        push    %ebx                    { Keep set1[0..3] xor set2[0..3] safe while the loop runs. }
.L4x_Loop:                              { Dwords from the high end down to (excluding) offset 0. }
        mov     (%eax,%esi), %ebx
        xor     (%edx,%esi), %ebx
        mov     %ebx, (%ecx,%esi)
        sub     $4, %esi
        ja      .L4x_Loop               { Exit once esi hits 0 or borrows negative. }
        pop     %ebx
        mov     %ebx, (%ecx)            { Write precalculated tail. }
        pop     %esi
        pop     %ebx
        ret     $4                      { Pop the single stack argument (size). }
.LBytewise_Prepare:
        add     $3, %esi                { esi = size - 1 = index of the last byte. }
.LBytewise_Loop:                        { Byte path never overlaps itself, no precalculation needed. }
        movzbl  (%eax,%esi), %ebx
        xor     (%edx,%esi), %bl
        mov     %bl, (%ecx,%esi)
        sub     $1, %esi
        jae     .LBytewise_Loop         { Continue while esi >= 0 (no borrow). }
        pop     %esi
        pop     %ebx
end;                                    { Final 'ret $4' for this path is presumably compiler-emitted. }
procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_symdif_sets {$else} fpc_varset_symdif_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ SSE version of dest := set1 XOR set2, 16 bytes per step.
  Same as fpc_varset_sub_sets_sse but with 'xor' instead of 'and not'.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size. }
asm
        push    %ebx
        mov     8(%esp), %ebx           { ebx = size (the push moved the arg from 4(%esp)). }
        sub     $16, %ebx               { ebx = position of the last (possibly partial) 16-byte chunk. }
        jl      .LFallback              { size < 16: delegate to the plain routine. Hopefully dead branch... }
        movups  (%eax), %xmm1           { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
        movups  (%edx), %xmm2           { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
        xorps   %xmm1, %xmm2            { xmm2 = set1[0..15] xor set2[0..15], held until after the loop. }
.L16x_Loop:
        movups  (%eax,%ebx), %xmm1
        movups  (%edx,%ebx), %xmm0
        xorps   %xmm1, %xmm0
        movups  %xmm0, (%ecx,%ebx)
        sub     $16, %ebx
        ja      .L16x_Loop              { Exit once ebx hits 0 or goes negative. }
        movups  %xmm2, (%ecx)           { Write precalculated tail. }
        pop     %ebx
        ret     $4
.LFallback:
        pop     %ebx                    { Back to entry state; 'size' is still at 4(%esp) for the tail call. }
        jmp     fpc_varset_symdif_sets_plain
end;
  302. {$ifndef CPUX86_HAS_SSEUNIT}
{ Runtime dispatch: the procedure variable starts out pointing at the one-shot
  dispatcher, which rebinds it to the SSE or plain implementation on first call. }
procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
var
  fpc_varset_symdif_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_symdif_sets_dispatch;
  306. procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint);
  307. begin
  308. if has_sse_support then
  309. fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_sse
  310. else
  311. fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_plain;
  312. fpc_varset_symdif_sets_impl(set1,set2,dest,size);
  313. end;
procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
{ Compilerproc entry point: forwards through the runtime-selected implementation. }
begin
  fpc_varset_symdif_sets_impl(set1,set2,dest,size);
end;
  318. {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_symdif_sets dispatcher)}
  319. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
  320. {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
  321. {$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
function fpc_varset_contains_sets_plain(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
{ Returns True iff set1 is a subset of set2, i.e. set1 AND NOT set2 is all
  zero over 'size' bytes. Read-only; early-outs on the first offending dword.
  eax = set1, edx = set2, ecx = size.
  NOTE(review): assumes size >= 1 — size = 0 would make the byte loop read
  out of bounds and miss its termination; confirm the compiler never emits that. }
asm
        push    %ebx
        sub     $4, %ecx
        jl      .LBytewise_Prepare      { size < 4: no whole dword. Probably dead branch... }
        add     %ecx, %eax
        add     %ecx, %edx
        neg     %ecx                    { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4. Loop ends on size >= 0, leaving up to 4 tail bytes. }
.L4x_Loop:
        mov     (%edx,%ecx), %ebx
        not     %ebx
        test    %ebx, (%eax,%ecx)       { Any bit of set1 missing from set2? }
        jnz     .LNo
        add     $4, %ecx
        js      .L4x_Loop
        mov     (%edx), %ebx            { Tail: final dword at set + size - 4 (may overlap the loop; harmless for a read-only test). }
        not     %ebx
        mov     %eax, %ecx              { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
        xor     %eax, %eax
        test    %ebx, (%ecx)
        setz    %al                     { True iff no leftover bits. }
        pop     %ebx
        ret
.LNo:
        xor     %eax, %eax              { False. }
        pop     %ebx
        ret
.LBytewise_Prepare:
        add     $4, %ecx                { Undo the bias: ecx = size again. }
        neg     %ecx                    { ecx = -size... }
        sub     %ecx, %eax              { ...and eax/edx point one past their sets: index runs -size..-1. }
        sub     %ecx, %edx
.LBytewise_Loop:
        movzbl  (%edx,%ecx), %ebx
        not     %ebx
        test    %bl, (%eax,%ecx)
        jnz     .LNo
        inc     %ecx
        jnz     .LBytewise_Loop         { Stop when the index wraps up to 0. }
        mov     $1, %eax                { True. }
        pop     %ebx
end;                                    { Final 'ret' for this path is presumably compiler-emitted. }
function {$ifdef CPUX86_HAS_SSE2} fpc_varset_contains_sets {$else} fpc_varset_contains_sets_sse2 {$endif} (const set1,set2;size : ptrint):boolean; assembler; nostackframe; {$ifdef CPUX86_HAS_SSE2} compilerproc; {$endif}
{ SSE2 subset test: True iff set1 AND NOT set2 is all zero over 'size' bytes.
  No early-out: accumulates leftovers and checks once at the end.
  eax = set1, edx = set2, ecx = size. }
asm
        sub     $16, %ecx               { ecx = position of the last (possibly partial) 16-byte chunk. }
        jl      .LFallback              { size < 16: delegate to the plain routine. Probably dead branch... }
        { Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
        movdqu  (%eax), %xmm1           { Chunk at offset 0; possible overlap with the loop is harmless for OR-folding. }
        movdqu  (%edx), %xmm2
        pandn   %xmm1, %xmm2            { xmm2 = set1 and not set2 (pandn: (not dst) and src). }
.L16x_Loop:
        movdqu  (%eax,%ecx), %xmm1
        movdqu  (%edx,%ecx), %xmm0
        pandn   %xmm1, %xmm0
        por     %xmm0, %xmm2            { Accumulate leftover bits. }
        sub     $16, %ecx
        ja      .L16x_Loop              { Exit once ecx hits 0 or goes negative. }
        pxor    %xmm0, %xmm0
        pcmpeqb %xmm2,%xmm0             { Per byte: 0xFF where the accumulator byte is zero. }
        pmovmskb %xmm0, %ecx            { 16-bit mask of zero bytes. }
        xor     %eax, %eax
        inc     %cx                     { 0xFFFF (all 16 bytes zero) wraps to 0 => ZF set. }
        setz    %al                     { True iff every accumulator byte was zero. }
        ret
.LFallback:
        add     $16, %ecx               { Restore ecx = size for the tail call. }
        jmp     fpc_varset_contains_sets_plain
end;
  392. {$ifndef CPUX86_HAS_SSE2}
{ Runtime dispatch: the function variable starts out pointing at the one-shot
  dispatcher, which rebinds it to the SSE2 or plain implementation on first call. }
function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean; forward;
var
  fpc_varset_contains_sets_impl: function(const set1,set2;size : ptrint):boolean = @fpc_varset_contains_sets_dispatch;
  396. function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean;
  397. begin
  398. if has_sse2_support then
  399. fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_sse2
  400. else
  401. fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_plain;
  402. result:=fpc_varset_contains_sets_impl(set1,set2,size);
  403. end;
function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; inline;
{ Compilerproc entry point: forwards through the runtime-selected implementation. }
begin
  result:=fpc_varset_contains_sets_impl(set1,set2,size);
end;
  408. {$endif ndef CPUX86_HAS_SSE2 (need fpc_varset_contains_sets dispatcher)}
  409. {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}