set.inc 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2002 by the Free Pascal development team
  4. Include file with set operations called by the compiler
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {$asmmode intel}
  {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
  { Set union: stores (set1 or set2) into dest, "size" bytes each.

    The routine has no stack frame; the arguments are used directly in their ABI registers:
      Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
      Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size

    Modifies xmm0 and xmm1 (size >= 16) or eax (size < 16), the register holding "size", and the flags.
    The first 16 bytes may be processed twice when size is not a multiple of 16; this is harmless
    because OR is idempotent, even when dest is the same buffer as set1 or set2.
    size <= 0 stores nothing. }
  procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
  asm
      sub    size, 16
      jl     @Bytewise_Prepare { size < 16; probably dead branch... }

      { "size" is now the offset of the last 16-byte chunk; chunks are processed from the end towards the start. }
  @16x_Loop:
      movdqu xmm0, xmmword ptr [set1 + size]
      movdqu xmm1, xmmword ptr [set2 + size]
      por    xmm0, xmm1
      movdqu xmmword ptr [dest + size], xmm0
      sub    size, 16
      ja     @16x_Loop { Continue while the next offset is above zero (no borrow, not zero). }

      { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead).
        Covers bytes 0..15, possibly overlapping the last chunk handled by the loop. }
      movdqu xmm0, xmmword ptr [set1]
      movdqu xmm1, xmmword ptr [set2]
      por    xmm0, xmm1
      movdqu xmmword ptr [dest], xmm0
      ret

  @Bytewise_Prepare:
      add    size, 15 { "size" = original size - 1 = index of the last byte. }
      cmp    size, 14
      ja     @Done { Unsigned check: valid indices here are 0..14, anything else means original size <= 0. }
  @Bytewise_Loop:
      movzx  eax, byte ptr [set1 + size]
      or     al, byte ptr [set2 + size]
      mov    byte ptr [dest + size], al
      sub    size, 1
      jae    @Bytewise_Loop { Stop once the index borrows below zero. }
  @Done:
  end;
  {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
  {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
  {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
  { Set intersection: stores (set1 and set2) into dest, "size" bytes each.
    Same as fpc_varset_add_sets but with 'and' instead of 'or'.

    The routine has no stack frame; the arguments are used directly in their ABI registers:
      Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
      Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size

    Modifies xmm0 and xmm1 (size >= 16) or eax (size < 16), the register holding "size", and the flags.
    The first 16 bytes may be processed twice when size is not a multiple of 16; this is harmless
    because AND is idempotent, even when dest is the same buffer as set1 or set2.
    size <= 0 stores nothing. }
  procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
  asm
      sub    size, 16
      jl     @Bytewise_Prepare { size < 16; probably dead branch... }

      { "size" is now the offset of the last 16-byte chunk; chunks are processed from the end towards the start. }
  @16x_Loop:
      movdqu xmm0, xmmword ptr [set1 + size]
      movdqu xmm1, xmmword ptr [set2 + size]
      pand   xmm0, xmm1
      movdqu xmmword ptr [dest + size], xmm0
      sub    size, 16
      ja     @16x_Loop { Continue while the next offset is above zero (no borrow, not zero). }

      { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead).
        Covers bytes 0..15, possibly overlapping the last chunk handled by the loop. }
      movdqu xmm0, xmmword ptr [set1]
      movdqu xmm1, xmmword ptr [set2]
      pand   xmm0, xmm1
      movdqu xmmword ptr [dest], xmm0
      ret

  @Bytewise_Prepare:
      add    size, 15 { "size" = original size - 1 = index of the last byte. }
      cmp    size, 14
      ja     @Done { Unsigned check: valid indices here are 0..14, anything else means original size <= 0. }
  @Bytewise_Loop:
      movzx  eax, byte ptr [set1 + size]
      and    al, byte ptr [set2 + size]
      mov    byte ptr [dest + size], al
      sub    size, 1
      jae    @Bytewise_Loop { Stop once the index borrows below zero. }
  @Done:
  end;
  {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
  {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
  {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
  { Set difference: stores (set1 and not set2) into dest, "size" bytes each.

    The routine has no stack frame; the arguments are used directly in their ABI registers:
      Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
      Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size

    Modifies xmm0, xmm1 and xmm2 (size >= 16) or eax (size < 16), the register holding "size", and the flags.
    dest can be equal to set1/set2. Because the operation is not idempotent, the result for
    bytes 0..15 is computed before anything is written and stored last.
    size <= 0 stores nothing. }
  procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
  asm
      sub    size, 16
      jl     @Bytewise_Prepare { size < 16; probably dead branch... }

      { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead).
        Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
      movdqu xmm1, xmmword ptr [set1]
      movdqu xmm2, xmmword ptr [set2]
      pandn  xmm2, xmm1 { xmm2 = (not set2) and set1 for bytes 0..15. }

      { "size" is the offset of the last 16-byte chunk; chunks are processed from the end towards the start. }
  @16x_Loop:
      movdqu xmm1, xmmword ptr [set1 + size]
      movdqu xmm0, xmmword ptr [set2 + size]
      pandn  xmm0, xmm1
      movdqu xmmword ptr [dest + size], xmm0
      sub    size, 16
      ja     @16x_Loop { Continue while the next offset is above zero (no borrow, not zero). }

      movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
      ret

  @Bytewise_Prepare:
      add    size, 15 { "size" = original size - 1 = index of the last byte. }
      cmp    size, 14
      ja     @Done { Unsigned check: valid indices here are 0..14, anything else means original size <= 0. }
  @Bytewise_Loop:
      movzx  eax, byte ptr [set2 + size]
      not    eax
      and    al, byte ptr [set1 + size]
      mov    byte ptr [dest + size], al
      sub    size, 1
      jae    @Bytewise_Loop { Stop once the index borrows below zero. }
  @Done:
  end;
  {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
  {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
  {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
  { Symmetric difference: stores (set1 xor set2) into dest, "size" bytes each.
    Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.

    The routine has no stack frame; the arguments are used directly in their ABI registers:
      Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
      Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size

    Modifies xmm0, xmm1 and xmm2 (size >= 16) or eax (size < 16), the register holding "size", and the flags.
    dest can be equal to set1/set2. Because the operation is not idempotent, the result for
    bytes 0..15 is computed before anything is written and stored last.
    size <= 0 stores nothing. }
  procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
  asm
      sub    size, 16
      jl     @Bytewise_Prepare { size < 16; probably dead branch... }

      { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead).
        Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
      movdqu xmm2, xmmword ptr [set1]
      movdqu xmm1, xmmword ptr [set2]
      pxor   xmm2, xmm1 { xmm2 = set1 xor set2 for bytes 0..15. }

      { "size" is the offset of the last 16-byte chunk; chunks are processed from the end towards the start. }
  @16x_Loop:
      movdqu xmm0, xmmword ptr [set1 + size]
      movdqu xmm1, xmmword ptr [set2 + size]
      pxor   xmm0, xmm1
      movdqu xmmword ptr [dest + size], xmm0
      sub    size, 16
      ja     @16x_Loop { Continue while the next offset is above zero (no borrow, not zero). }

      movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
      ret

  @Bytewise_Prepare:
      add    size, 15 { "size" = original size - 1 = index of the last byte. }
      cmp    size, 14
      ja     @Done { Unsigned check: valid indices here are 0..14, anything else means original size <= 0. }
  @Bytewise_Loop:
      movzx  eax, byte ptr [set2 + size]
      xor    al, byte ptr [set1 + size]
      mov    byte ptr [dest + size], al
      sub    size, 1
      jae    @Bytewise_Loop { Stop once the index borrows below zero. }
  @Done:
  end;
  {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
  {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
  {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
  { Returns true when (set1 and not set2) is zero over all "size" bytes, i.e. when no bit is set
    in set1 without also being set in set2; returns false otherwise.

    The routine has no stack frame; the arguments are used directly in their ABI registers:
      Windows: rcx = set1, rdx = set2, r8 = size
      Linux:   rdi = set1, rsi = set2, rdx = size

    Result is returned in al (upper bits of eax are zero).
    Modifies rax, ecx, xmm0, xmm1, xmm2, the flags, and in the bytewise path the registers holding
    set1, set2 and "size". ecx is only overwritten after set1 has been read for the last time.
    size <= 0 returns true. }
  function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
  asm
      sub    size, 16
      jl     @Bytewise_Prepare { size < 16; probably dead branch... }

  {$if false}
      { Scans 16 bytes at a time left to right with early exits.
        Would be better for large enough sets (maybe around 64 bytes or even more) — if they existed, but worse for actually existing 32.
        Kept for the future. }
      pxor   xmm2, xmm2 { xmm2 = 0 }
      add    set1, size
      add    set2, size
      neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
                    Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
  @16x_Loop:
      movdqu xmm1, xmmword ptr [set1 + size]
      movdqu xmm0, xmmword ptr [set2 + size]
      pandn  xmm0, xmm1
      pcmpeqb xmm0, xmm2
      pmovmskb eax, xmm0
      inc    ax
      jnz    @No
      add    size, 16
      js     @16x_Loop

      movdqu xmm1, xmmword ptr [set1]
      movdqu xmm0, xmmword ptr [set2]
      pandn  xmm0, xmm1
  {$else}
      { Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
      movdqu xmm1, xmmword ptr [set1]
      movdqu xmm2, xmmword ptr [set2]
      pandn  xmm2, xmm1 { xmm2 = accumulator, seeded with bytes 0..15. }

      { "size" is the offset of the last 16-byte chunk; chunks are folded from the end towards the start. }
  @16x_Loop:
      movdqu xmm1, xmmword ptr [set1 + size]
      movdqu xmm0, xmmword ptr [set2 + size]
      pandn  xmm0, xmm1
      por    xmm2, xmm0
      sub    size, 16
      ja     @16x_Loop { Continue while the next offset is above zero (no borrow, not zero). }

      pxor   xmm0, xmm0
  {$endif}
      pcmpeqb xmm0, xmm2 { Each byte of xmm0 becomes $FF where the two operands hold equal bytes. }
      pmovmskb ecx, xmm0 { cx = $FFFF when all 16 bytes compared equal. }
      xor    eax, eax
      inc    cx { $FFFF + 1 wraps to 0 and sets ZF. }
      setz   al
      ret

  @No:
      xor    eax, eax
      ret

  @Bytewise_Prepare:
      add    size, 16 { Restore the original size. }
      lea    rax, [size - 1]
      cmp    rax, 14
      ja     @Yes { Unsigned check: the size here must be 1..15, anything else means size <= 0, and an empty set is contained in any set. }
      neg    size
      sub    set1, size { set1 and set2 now point past their last byte; "size" counts up from -size to 0. }
      sub    set2, size
  @Bytewise_Loop:
      movzx  eax, byte ptr [set2 + size]
      not    eax
      test   byte ptr [set1 + size], al
      jnz    @No { A bit is set in set1 but not in set2. }
      inc    size
      jnz    @Bytewise_Loop
  @Yes:
      mov    eax, $1
  end;
  {$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
  {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
  201. {$asmmode att}