Browse Source

* x86: "x and ((1 shl y) - 1)" to BZHI adapted for in_and_assign_x_y

J. Gareth "Curious Kit" Moreton 2 years ago
parent
commit
c40f518de4
1 changed files with 88 additions and 1 deletions
  1. 88 1
      compiler/x86/nx86inl.pas

+ 88 - 1
compiler/x86/nx86inl.pas

@@ -61,6 +61,7 @@ interface
           { second pass override to generate these nodes }
           procedure pass_generate_code_cpu;override;
           procedure second_IncludeExclude;override;
+          procedure second_AndOrXorShiftRot_assign;override;
           procedure second_pi; override;
           procedure second_arctan_real; override;
           procedure second_abs_real; override;
@@ -98,7 +99,7 @@ implementation
       htypechk,
       cgbase,pass_1,pass_2,
       cpuinfo,cpubase,nutils,
-      ncal,ncgutil,nld,ncon,
+      ncal,ncgutil,nld,ncon,nadd,nmat,constexp,
       tgobj,
       cga,cgutils,cgx86,cgobj,hlcgobj;
 
@@ -619,6 +620,92 @@ implementation
        end;
 
 
+     procedure tx86inlinenode.second_AndOrXorShiftRot_assign;
+       var
+         opsize : tcgsize;
+         valuenode, indexnode, loadnode: TNode;
+         DestReg: TRegister;
+       begin
+{$ifndef i8086}
+         if (cs_opt_level2 in current_settings.optimizerswitches) then
+           begin
+             { Saves on a lot of typecasting and potential coding mistakes }
+             valuenode := tcallparanode(left).left;
+             loadnode := tcallparanode(tcallparanode(left).right).left;
+
+             opsize := def_cgsize(loadnode.resultdef);
+
+             { BMI2 optimisations }
+             if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) then
+               begin
+                 { If the second operand is "((1 shl y) - 1)", we can turn it
+                   into a BZHI operator instead }
+                 if (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
+                   (valuenode.nodetype = subn) and
+                   (taddnode(valuenode).right.nodetype = ordconstn) and
+                   (tordconstnode(taddnode(valuenode).right).value = 1) and
+                   (taddnode(valuenode).left.nodetype = shln) and
+                   (tshlshrnode(taddnode(valuenode).left).left.nodetype = ordconstn) and
+                   (tordconstnode(tshlshrnode(taddnode(valuenode).left).left).value = 1) then
+                   begin
+                     { Skip the subtract and shift nodes completely }
+
+                     { Helps avoid all the awkward typecasts }
+                     indexnode := tshlshrnode(taddnode(valuenode).left).right;
+{$ifdef x86_64}
+                     { The code generator sometimes extends the shift result to 64-bit unnecessarily }
+                     if (indexnode.nodetype = typeconvn) and (opsize in [OS_32, OS_S32]) and
+                       (def_cgsize(TTypeConvNode(indexnode).resultdef) in [OS_64, OS_S64]) then
+                       begin
+                         { Convert to the 32-bit type }
+                         indexnode.resultdef := loadnode.resultdef;
+                         node_reset_flags(indexnode,[nf_pass1_done]);
+
+                         { We should't be getting any new errors }
+                         if do_firstpass(indexnode) then
+                           InternalError(2022110202);
+
+                         { Keep things internally consistent in case indexnode changed }
+                         tshlshrnode(taddnode(valuenode).left).right := indexnode;
+                       end;
+{$endif x86_64}
+                     secondpass(indexnode);
+                     secondpass(loadnode);
+
+                     { allocate registers }
+                     hlcg.location_force_reg(
+                       current_asmdata.CurrAsmList,
+                       indexnode.location,
+                       indexnode.resultdef,
+                       loadnode.resultdef,
+                       false
+                     );
+
+                     case loadnode.location.loc of
+                       LOC_REFERENCE,
+                       LOC_CREFERENCE:
+                         begin
+                           { BZHI can only write to a register }
+                           DestReg := cg.getintregister(current_asmdata.CurrAsmList,opsize);
+                           emit_reg_ref_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, loadnode.location.reference, DestReg);
+                           emit_reg_ref(A_MOV, TCGSize2OpSize[opsize], DestReg, loadnode.location.reference);
+                         end;
+                       LOC_REGISTER,
+                       LOC_CREGISTER:
+                         emit_reg_reg_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, loadnode.location.register, loadnode.location.register);
+                       else
+                         InternalError(2022102110);
+                     end;
+
+                     Exit;
+                   end;
+               end;
+           end;
+{$endif not i8086}
+
+         inherited second_AndOrXorShiftRot_assign;
+       end;
+
      procedure tx86inlinenode.second_pi;
        begin
          location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));