Browse Source

* x86: The code generator will now attempt manipulate "x and (not y)" (where x and y are ordinals) to use ANDN.

J. Gareth "Curious Kit" Moreton 2 years ago
parent
commit
626e23d89f
1 changed files with 73 additions and 1 deletions
  1. 73 1
      compiler/x86/nx86add.pas

+ 73 - 1
compiler/x86/nx86add.pas

@@ -82,7 +82,7 @@ unit nx86add;
       tgobj,ncgutil,
       tgobj,ncgutil,
       ncon,nset,ninl,ncnv,ncal,nmat,
       ncon,nset,ninl,ncnv,ncal,nmat,
       defutil,defcmp,constexp,
       defutil,defcmp,constexp,
-      htypechk;
+      pass_2,htypechk;
 
 
 { Range check must be disabled explicitly as the code serves
 { Range check must be disabled explicitly as the code serves
   on three different architecture sizes }
   on three different architecture sizes }
@@ -1943,6 +1943,78 @@ unit nx86add;
 
 
        opsize:=def_cgsize(left.resultdef);
        opsize:=def_cgsize(left.resultdef);
 
 
+{$ifndef i8086}
+       { Bit-manipulation optimisations }
+       if (cs_opt_level2 in current_settings.optimizerswitches) and
+         (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) then
+         begin
+           { Can we turn "x and (not y)" into an ANDN instruction instead? }
+           if (nodetype = andn) and
+             (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
+             ((left.nodetype = notn) or (right.nodetype = notn)) and
+             (
+               { With "const and (not variable)", ANDN will produce larger
+                 code once everything is moved into registers (as a side-note,
+                 "const and (not const)" and "variable and (not const)" will
+                 have been simplified earlier to remove the NOT operation). }
+               not (cs_opt_size in current_settings.optimizerswitches) or
+               (
+                 (left.location.loc <> LOC_CONSTANT) and
+                 (right.location.loc <> LOC_CONSTANT)
+               )
+             ) then
+             begin
+               { ANDN only supports the second operand being inverted; however,
+                 since we're dealing with ordinals, there won't be any Boolean
+                 shortcutting, so we can safely swap the parameters }
+
+               if (right.nodetype <> notn) then
+                 swapleftright;
+
+               secondpass(left);
+               { Skip the not node completely }
+               secondpass(tnotnode(right).left);
+
+               { allocate registers }
+               hlcg.location_force_reg(
+                 current_asmdata.CurrAsmList,
+                 tnotnode(right).left.location,
+                 tnotnode(right).left.resultdef,
+                 tnotnode(right).left.resultdef,
+                 false
+               );
+
+               if left.location.loc = LOC_CONSTANT then
+                 { With "const and (not variable)", we can probably still make a
+                   saving when it comes to pipeline stalls (left.location.loc
+                   will become LOC_CREGISTER). }
+                 hlcg.location_force_reg(
+                   current_asmdata.CurrAsmList,
+                   left.location,
+                   left.resultdef,
+                   left.resultdef,
+                   true
+                 );
+
+               set_result_location_reg;
+
+               case left.location.loc of
+                 LOC_REFERENCE,
+                 LOC_CREFERENCE:
+                   emit_ref_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.reference, tnotnode(right).left.location.register, location.register);
+                 LOC_REGISTER,
+                 LOC_CREGISTER:
+                   emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register)
+                 else
+                   InternalError(2022102101);
+               end;
+
+               { Overflow can't happen with and/andn }
+               Exit;
+             end;
+         end;
+{$endif not i8086}
+
        pass_left_right;
        pass_left_right;
 
 
        { do we have to allocate a register? If yes, then three opcode instructions are better, however for sub three op code instructions
        { do we have to allocate a register? If yes, then three opcode instructions are better, however for sub three op code instructions