浏览代码

Merge branch 'main' of gitlab.com:freepascal.org/fpc/source into main

Michaël Van Canneyt 4 年之前
父节点
当前提交
5022d0e772
共有 4 个文件被更改,包括 242 次插入179 次删除
  1. 9 1
      compiler/nld.pas
  2. 73 62
      compiler/optloadmodifystore.pas
  3. 157 113
      compiler/x86/aoptx86.pas
  4. 3 3
      rtl/x86_64/math.inc

+ 9 - 1
compiler/nld.pas

@@ -197,7 +197,8 @@ implementation
       cpuinfo,
       htypechk,pass_1,procinfo,paramgr,
       nbas,ncon,nflw,ninl,ncnv,nmem,ncal,nutils,
-      cgbase
+      cgbase,
+      optloadmodifystore
       ;
 
 
@@ -625,6 +626,13 @@ implementation
            is_constrealnode(right) and
            not equal_defs(right.resultdef,left.resultdef) then
           inserttypeconv(right,left.resultdef);
+{$if (cs_opt_use_load_modify_store in supported_optimizerswitches)}
+        { Perform simple optimizations when -O2 is enabled but the dedicated
+          cs_opt_use_load_modify_store optimization pass is not. }
+        if (cs_opt_level2 in current_settings.optimizerswitches) and
+           not (cs_opt_use_load_modify_store in current_settings.optimizerswitches) then
+          result:=try_opt_assignmentnode(self);
+{$endif}
       end;
 
 

+ 73 - 62
compiler/optloadmodifystore.pas

@@ -38,16 +38,17 @@ unit optloadmodifystore;
   interface
 
     uses
-      node;
+      node,nld;
 
     procedure do_optloadmodifystore(var rootnode : tnode);
+    function try_opt_assignmentnode(assignmentnode : tassignmentnode): tnode;
 
   implementation
 
     uses
-      globtype,verbose,nutils,compinnr,
+      globtype,globals,verbose,nutils,compinnr,
       defutil,defcmp,htypechk,pass_1,constexp,
-      nadd,ncal,ncon,ncnv,ninl,nld,nmat,
+      nadd,ncal,ncon,ncnv,ninl,nmat,
       symdef;
 
     function try_opt_assignmentnode(assignmentnode: tassignmentnode): tnode;
@@ -57,6 +58,10 @@ unit optloadmodifystore;
         result:=nil;
         with assignmentnode do
           begin
+            { *** Simple optimizations follow. They are performed either
+              at -O2 (via a call from tassignmentnode.simplify) or when
+              cs_opt_use_load_modify_store is enabled (in a separate pass).
+            }
             { replace i:=succ/pred(i) by inc/dec(i)? }
             if (right.nodetype=inlinen) and
               ((tinlinenode(right).inlinenumber=in_succ_x) or (tinlinenode(right).inlinenumber=in_pred_x)) and
@@ -273,6 +278,71 @@ unit optloadmodifystore;
                 taddnode(ttypeconvnode(right).left).left:=nil;
                 exit;
               end;
+            { replace i:=not i  by in_not_assign_x(i)
+                      i:=-i     by in_neg_assign_x(i)
+
+              this handles the case, where there are no implicit type conversions }
+            if (right.nodetype in [notn,unaryminusn]) and
+              (tunarynode(right).left.isequal(left)) and
+              is_integer(tunarynode(right).left.resultdef) and
+              ((localswitches*[cs_check_overflow,cs_check_range])=[]) and
+              ((right.localswitches*[cs_check_overflow,cs_check_range])=[]) and
+              valid_for_var(tunarynode(right).left,false) and
+              not(might_have_sideeffects(tunarynode(right).left)) then
+              begin
+                if right.nodetype=notn then
+                  newinlinenodetype:=in_not_assign_x
+                else
+                  newinlinenodetype:=in_neg_assign_x;
+                result:=cinlinenode.createintern(
+                  newinlinenodetype,false,tunarynode(right).left);
+                result.localswitches:=localswitches;
+                result.fileinfo:=fileinfo;
+                result.verbosity:=verbosity;
+                tunarynode(right).left:=nil;
+                exit;
+              end;
+            { replace i:=not i  by in_not_assign_x(i)
+                      i:=-i     by in_neg_assign_x(i)
+
+              this handles the case with type conversions:
+                   outer typeconv: right
+                          neg/not: ttypeconvnode(right).left
+                   inner typeconv: tunarynode(ttypeconvnode(right).left).left
+                   right side 'i': ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left }
+            if (right.nodetype=typeconvn) and
+               (ttypeconvnode(right).convtype=tc_int_2_int) and
+               (ttypeconvnode(right).left.nodetype in [notn,unaryminusn]) and
+               is_integer(ttypeconvnode(right).left.resultdef) and
+               (right.resultdef.size<=ttypeconvnode(right).left.resultdef.size) and
+               (tunarynode(ttypeconvnode(right).left).left.nodetype=typeconvn) and
+               (ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).convtype=tc_int_2_int) and
+               are_equal_ints(right.resultdef,ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left.resultdef) and
+               ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left.isequal(left) and
+               is_integer(ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left.resultdef) and
+               ((localswitches*[cs_check_overflow,cs_check_range])=[]) and
+               ((right.localswitches*[cs_check_overflow,cs_check_range])=[]) and
+               valid_for_var(ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left,false) and
+               not(might_have_sideeffects(ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left)) then
+              begin
+                if ttypeconvnode(right).left.nodetype=notn then
+                  newinlinenodetype:=in_not_assign_x
+                else
+                  newinlinenodetype:=in_neg_assign_x;
+                result:=cinlinenode.createintern(
+                  newinlinenodetype,false,ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left);
+                result.localswitches:=localswitches;
+                result.fileinfo:=fileinfo;
+                result.verbosity:=verbosity;
+                ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left:=nil;
+                exit;
+              end;
+
+            if not (cs_opt_use_load_modify_store in current_settings.optimizerswitches) then
+              exit;
+            { *** Here are more complex optimizations which are performed only
+              when cs_opt_use_load_modify_store is enabled.
+            }
 {$ifdef enable_shl_shr_assign_x_y}
             { replace i:=i shl k by in_shl_assign_x_y(i,k)
                       i:=i shr k by in_shr_assign_x_y(i,k)
@@ -555,65 +625,6 @@ unit optloadmodifystore;
                 exit;
               end;
 {$endif enable_sar_assign_x_y or enable_rox_assign_x_y}
-            { replace i:=not i  by in_not_assign_x(i)
-                      i:=-i     by in_neg_assign_x(i)
-
-              this handles the case, where there are no implicit type conversions }
-            if (right.nodetype in [notn,unaryminusn]) and
-              (tunarynode(right).left.isequal(left)) and
-              is_integer(tunarynode(right).left.resultdef) and
-              ((localswitches*[cs_check_overflow,cs_check_range])=[]) and
-              ((right.localswitches*[cs_check_overflow,cs_check_range])=[]) and
-              valid_for_var(tunarynode(right).left,false) and
-              not(might_have_sideeffects(tunarynode(right).left)) then
-              begin
-                if right.nodetype=notn then
-                  newinlinenodetype:=in_not_assign_x
-                else
-                  newinlinenodetype:=in_neg_assign_x;
-                result:=cinlinenode.createintern(
-                  newinlinenodetype,false,tunarynode(right).left);
-                result.localswitches:=localswitches;
-                result.fileinfo:=fileinfo;
-                result.verbosity:=verbosity;
-                tunarynode(right).left:=nil;
-                exit;
-              end;
-            { replace i:=not i  by in_not_assign_x(i)
-                      i:=-i     by in_neg_assign_x(i)
-
-              this handles the case with type conversions:
-                   outer typeconv: right
-                          neg/not: ttypeconvnode(right).left
-                   inner typeconv: tunarynode(ttypeconvnode(right).left).left
-                   right side 'i': ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left }
-            if (right.nodetype=typeconvn) and
-               (ttypeconvnode(right).convtype=tc_int_2_int) and
-               (ttypeconvnode(right).left.nodetype in [notn,unaryminusn]) and
-               is_integer(ttypeconvnode(right).left.resultdef) and
-               (right.resultdef.size<=ttypeconvnode(right).left.resultdef.size) and
-               (tunarynode(ttypeconvnode(right).left).left.nodetype=typeconvn) and
-               (ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).convtype=tc_int_2_int) and
-               are_equal_ints(right.resultdef,ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left.resultdef) and
-               ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left.isequal(left) and
-               is_integer(ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left.resultdef) and
-               ((localswitches*[cs_check_overflow,cs_check_range])=[]) and
-               ((right.localswitches*[cs_check_overflow,cs_check_range])=[]) and
-               valid_for_var(ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left,false) and
-               not(might_have_sideeffects(ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left)) then
-              begin
-                if ttypeconvnode(right).left.nodetype=notn then
-                  newinlinenodetype:=in_not_assign_x
-                else
-                  newinlinenodetype:=in_neg_assign_x;
-                result:=cinlinenode.createintern(
-                  newinlinenodetype,false,ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left);
-                result.localswitches:=localswitches;
-                result.fileinfo:=fileinfo;
-                result.verbosity:=verbosity;
-                ttypeconvnode(tunarynode(ttypeconvnode(right).left).left).left:=nil;
-                exit;
-              end;
           end;
       end;
 

+ 157 - 113
compiler/x86/aoptx86.pas

@@ -8618,39 +8618,77 @@ unit aoptx86;
           begin
             if (taicpu(p).oper[0]^.typ = top_const) then
               begin
-                if (taicpu(hp1).opcode = A_AND) and
-                  MatchOpType(taicpu(hp1),top_const,top_reg) and
-                  (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
-                  { the second register must contain the first one, so compare their subreg types }
-                  (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
-                  (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
-                  { change
-                      and const1, reg
-                      and const2, reg
-                    to
-                      and (const1 and const2), reg
-                  }
-                  begin
-                    taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
-                    DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
-                    RemoveCurrentP(p, hp1);
-                    Result:=true;
-                    exit;
-                  end
-                else if (taicpu(hp1).opcode = A_MOVZX) and
-                  MatchOpType(taicpu(hp1),top_reg,top_reg) and
-                  SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
-                  (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
-                   (((taicpu(p).opsize=S_W) and
-                     (taicpu(hp1).opsize=S_BW)) or
-                    ((taicpu(p).opsize=S_L) and
-                     (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}]))
+                case taicpu(hp1).opcode of
+                  A_AND:
+                    if MatchOpType(taicpu(hp1),top_const,top_reg) and
+                      (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
+                      { the second register must contain the first one, so compare their subreg types }
+                      (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
+                      (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
+                      { change
+                          and const1, reg
+                          and const2, reg
+                        to
+                          and (const1 and const2), reg
+                      }
+                      begin
+                        taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
+                        DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
+                        RemoveCurrentP(p, hp1);
+                        Result:=true;
+                        exit;
+                      end;
+
+                  A_CMP:
+                    if (PopCnt(DWord(taicpu(p).oper[0]^.val)) = 1) and { Only 1 bit set }
+                      MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.val) and
+                      MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
+                      { Just check that the condition on the next instruction is compatible }
+                      GetNextInstruction(hp1, hp2) and
+                      (hp2.typ = ait_instruction) and
+                      (taicpu(hp2).condition in [C_Z, C_E, C_NZ, C_NE])
+                      then
+                        { change
+                            and  2^n, reg
+                            cmp  2^n, reg
+                            j(c) / set(c) / cmov(c)   (c is equal or not equal)
+                          to
+                            and  2^n, reg
+                            test reg, reg
+                            j(~c) / set(~c) / cmov(~c)
+                        }
+                      begin
+                        { Keep TEST instruction in, rather than remove it, because
+                          it may trigger other optimisations such as MovAndTest2Test }
+                        taicpu(hp1).loadreg(0, taicpu(hp1).oper[1]^.reg);
+                        taicpu(hp1).opcode := A_TEST;
+                        DebugMsg(SPeepholeOptimization + 'AND/CMP/J(c) -> AND/J(~c) with power of 2 constant', p);
+                        taicpu(hp2).condition := inverse_cond(taicpu(hp2).condition);
+                        Result := True;
+                        Exit;
+                      end;
+
+                  A_MOVZX:
+                    if MatchOpType(taicpu(hp1),top_reg,top_reg) and
+                      SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
+                      (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
+                      (
+                        (
+                          (taicpu(p).opsize=S_W) and
+                          (taicpu(hp1).opsize=S_BW)
+                        ) or
+                        (
+                          (taicpu(p).opsize=S_L) and
+                          (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}])
+                        )
 {$ifdef x86_64}
-                      or
-                     ((taicpu(p).opsize=S_Q) and
-                      (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL]))
+                        or
+                        (
+                          (taicpu(p).opsize=S_Q) and
+                          (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL])
+                        )
 {$endif x86_64}
-                    ) then
+                      ) then
                       begin
                         if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                             ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
@@ -8673,108 +8711,114 @@ unit aoptx86;
                             { See if there are other optimisations possible }
                             Continue;
                           end;
-                      end
-                else if (taicpu(hp1).opcode = A_SHL) and
-                  MatchOpType(taicpu(hp1),top_const,top_reg) and
-                  (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
-                  begin
+                      end;
+
+                  A_SHL:
+                    if MatchOpType(taicpu(hp1),top_const,top_reg) and
+                      (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
+                      begin
 {$ifopt R+}
 {$define RANGE_WAS_ON}
 {$R-}
 {$endif}
-                    { get length of potential and mask }
-                    MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
+                        { get length of potential and mask }
+                        MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
 
-                    { really a mask? }
+                        { really a mask? }
 {$ifdef RANGE_WAS_ON}
 {$R+}
 {$endif}
-                    if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
-                      { unmasked part shifted out? }
-                      ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
-                      begin
-                        DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
-                        RemoveCurrentP(p, hp1);
-                        Result:=true;
-                        exit;
+                        if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
+                          { unmasked part shifted out? }
+                          ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
+                          begin
+                            DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
+                            RemoveCurrentP(p, hp1);
+                            Result:=true;
+                            exit;
+                          end;
                       end;
-                  end
-                else if (taicpu(hp1).opcode = A_SHR) and
-                  MatchOpType(taicpu(hp1),top_const,top_reg) and
-                  (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
-                  (taicpu(hp1).oper[0]^.val <= 63) then
-                  begin
-                    { Does SHR combined with the AND cover all the bits?
 
-                      e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
+                  A_SHR:
+                    if MatchOpType(taicpu(hp1),top_const,top_reg) and
+                      (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
+                      (taicpu(hp1).oper[0]^.val <= 63) then
+                      begin
+                        { Does SHR combined with the AND cover all the bits?
 
-                    MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
+                          e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
 
-                    if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
-                      ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
-                      ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
-                      begin
-                        DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
-                        RemoveCurrentP(p, hp1);
-                        Result := True;
-                        Exit;
+                        MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
+
+                        if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
+                          ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
+                          ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
+                          begin
+                            DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
+                            RemoveCurrentP(p, hp1);
+                            Result := True;
+                            Exit;
+                          end;
                       end;
-                  end
-                else if ((taicpu(hp1).opcode = A_MOVSX){$ifdef x86_64} or (taicpu(hp1).opcode = A_MOVSXD){$endif x86_64}) and
-                  (taicpu(hp1).oper[0]^.typ = top_reg) and
-                  SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
-                    begin
-                      if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
-                        (
+
+                  A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
+                    if (taicpu(hp1).oper[0]^.typ = top_reg) and
+                      SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
+                      begin
+                        if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
                           (
-                            (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
-                            ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
-                          ) or (
-                            (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
-                            ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
+                            (
+                              (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
+                              ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
+                            ) or (
+                              (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
+                              ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
 {$ifdef x86_64}
-                          ) or (
-                            (taicpu(hp1).opsize = S_LQ) and
-                            ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
+                            ) or (
+                              (taicpu(hp1).opsize = S_LQ) and
+                              ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
 {$endif x86_64}
-                          )
-                        ) then
-                        begin
-                          if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
-                            begin
-                              DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
-                              RemoveInstruction(hp1);
-                              { See if there are other optimisations possible }
-                              Continue;
-                            end;
+                            )
+                          ) then
+                          begin
+                            if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
+                              begin
+                                DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
+                                RemoveInstruction(hp1);
+                                { See if there are other optimisations possible }
+                                Continue;
+                              end;
 
-                          { The super-registers are the same though.
+                            { The super-registers are the same though.
 
-                            Note that this change by itself doesn't improve
-                            code speed, but it opens up other optimisations. }
+                              Note that this change by itself doesn't improve
+                              code speed, but it opens up other optimisations. }
 {$ifdef x86_64}
-                          { Convert 64-bit register to 32-bit }
-                          case taicpu(hp1).opsize of
-                            S_BQ:
-                              begin
-                                taicpu(hp1).opsize := S_BL;
-                                taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
-                              end;
-                            S_WQ:
-                              begin
-                                taicpu(hp1).opsize := S_WL;
-                                taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
-                              end
-                            else
-                              ;
-                          end;
+                            { Convert 64-bit register to 32-bit }
+                            case taicpu(hp1).opsize of
+                              S_BQ:
+                                begin
+                                  taicpu(hp1).opsize := S_BL;
+                                  taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
+                                end;
+                              S_WQ:
+                                begin
+                                  taicpu(hp1).opsize := S_WL;
+                                  taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
+                                end
+                              else
+                                ;
+                            end;
 {$endif x86_64}
-                          DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
-                          taicpu(hp1).opcode := A_MOVZX;
-                          { See if there are other optimisations possible }
-                          Continue;
-                        end;
-                    end;
+                            DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
+                            taicpu(hp1).opcode := A_MOVZX;
+                            { See if there are other optimisations possible }
+                            Continue;
+                          end;
+                      end;
+                  else
+                    ;
+                end;
               end;
 
             if (taicpu(hp1).is_jmp) and

+ 3 - 3
rtl/x86_64/math.inc

@@ -295,7 +295,7 @@ const
             fnstcw oldcw
             fldt d
             movw oldcw,%cx
-            orw $0x0c3f,%cx
+            orw $0x0c00,%cx
             movw %cx,newcw
             fldcw newcw
             fld %st
@@ -315,7 +315,7 @@ const
       asm
             fnstcw oldcw
             movw oldcw,%cx
-            orw $0x0c3f,%cx
+            orw $0x0c00,%cx
             movw %cx,newcw
             fldcw newcw
             fldt d
@@ -336,7 +336,7 @@ const
       asm
         fnstcw oldcw
         movw oldcw,%cx
-        orw $0x0c3f,%cx
+        orw $0x0c00,%cx
         movw %cx,newcw
         fldcw newcw
         fldt d