{
    Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
    Development Team

    This unit implements the PowerPC optimizer object

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}

Unit aoptcpu;

Interface

{$i fpcdefs.inc}

uses cpubase, aoptobj, aoptcpub, aopt, aasmtai,aasmdata, aasmcpu;

Type
  { PowerPC-specific peephole optimizer; registered as casmoptimizer in the
    unit initialization section at the end of this file. }
  TCpuAsmOptimizer = class(TAsmOptimizer)
    { uses the same constructor as TAopObj }
    function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
   private
     function cmpi_mfcr_opt(p, next1, next2: taicpu): boolean;
  End;

Implementation

  uses
    cutils, verbose, cgbase, cgcpu, cgobj;


  { Rewrites the sequence

      p:     cmpwi/cmplwi reg,0
      next1: mfcr  x
      next2: rlwinm x,x,N,31,31

    (already matched by PeepHoleOptPass1Cpu) into a shorter sequence that
    does not go through the condition register. N (operand 2 of next2)
    selects which rewrite is attempted. An optional following
    "xori x,x,1" is recognised as an inversion of the extracted bit.

    Returns true if the instruction list was rewritten, false if no rewrite
    exists for this N / opcode combination (the list is then untouched). }
  function TCpuAsmOptimizer.cmpi_mfcr_opt(p, next1, next2: taicpu): boolean;
    var
      next3, prev: tai;
      inverse, prevrlwinm: boolean;
    begin
      result := true;
      // is the extracted bit inverted afterwards by "xori x,x,1"?
      inverse :=
        getnextinstruction(next2,next3) and
        (next3.typ = ait_instruction) and
        (taicpu(next3).opcode = A_XORI) and
        (taicpu(next3).oper[0]^.reg = taicpu(next3).oper[1]^.reg) and
        (taicpu(next3).oper[0]^.reg = taicpu(next2).oper[0]^.reg) and
        (taicpu(next3).oper[2]^.val = 1);
      case taicpu(next2).oper[2]^.val of
        1:
         begin
           // less than zero or greater/equal than zero (the xori remains
           // in the latter case). Doesn't make sense for unsigned comparisons.
           if (p.opcode = A_CMPWI) then
             begin
               // p becomes "srwi x,reg,31"; mfcr and rlwinm are dropped
               p.opcode := A_SRWI;
               p.ops := 3;
               p.loadreg(1,p.oper[0]^.reg);
               p.loadreg(0,next1.oper[0]^.reg);
               p.loadconst(2,31);
               asml.remove(next1);
               next1.free;
               asml.remove(next2);
               next2.free;
             end
           else
             result := false;
         end;
{    needs two registers to work with
        2:
         begin
           // greater or less/equal to zero
         end;
}
        3:
         begin
           // was the compared register produced by an rlwinm that isolates
           // exactly one bit (mask begin = mask end)?
           prevrlwinm :=
             getlastinstruction(p,prev) and
             (prev.typ = ait_instruction) and
             ((taicpu(prev).opcode = A_RLWINM) or
              (taicpu(prev).opcode = A_RLWINM_)) and
             (taicpu(prev).oper[0]^.reg = p.oper[0]^.reg) and
             (taicpu(prev).oper[3]^.val = taicpu(prev).oper[4]^.val);

           if (prevrlwinm) then
             begin
               // isolate the bit we need
               if (taicpu(prev).oper[3]^.val <> 31) then
                 begin
                   // rotate the single bit into the least significant position
                   p.opcode := A_RLWINM;
                   p.ops := 5;
                   p.loadreg(1,p.oper[0]^.reg);
                   p.loadreg(0,next1.oper[0]^.reg);
                   p.loadconst(2,taicpu(prev).oper[3]^.val + 1);
                   p.loadconst(3,31);
                   p.loadconst(4,31);
                 end
               else { if (taicpu(prev).oper[0]^.reg <> next1.oper[0]^.reg) then }
                 begin
                   // the bit is already in the least significant position
                   p.opcode := A_MR;
                   p.loadreg(1,p.oper[0]^.reg);
                   p.loadreg(0,next1.oper[0]^.reg);
                 end;
               if not inverse then
                 begin
                   // reuse the mfcr slot for the "xori x,x,1" that is
                   // needed when the original code had no inversion
                   next1.ops := 3;
                   next1.opcode := A_XORI;
                   next1.loadreg(1,next1.oper[0]^.reg);
                   next1.loadconst(2,1);
                 end
               else
                 begin
                   // the original inversion cancels out: drop mfcr and xori
                   asml.remove(next1);
                   next1.free;
                   asml.remove(next3);
                   next3.free;
                 end;
               asml.remove(next2);
               next2.free;
             end
           else
             begin
                // equal/not equal to zero (the xori remains in the latter case;
                // there's a more optimal sequence without it, but needs extra
                // register)
                p.opcode := A_CNTLZW;
                p.loadreg(1,p.oper[0]^.reg);
                p.loadreg(0,next1.oper[0]^.reg);
                next1.ops := 3;
                next1.opcode := A_SRWI;
                next1.loadreg(1,next1.oper[0]^.reg);
                next1.loadconst(2,5);
                asml.remove(next2);
                next2.free;
              end;
         end;
        else
          result := false;
      end;
    end;


  { Returns the 32 bit mask described by an rlwinm-style mask begin (l1) and
    mask end (l2), where bit 0 is the most significant bit. If l1 > l2 the
    mask wraps around, i.e. it covers bits 0..l2 and l1..31. }
  function rlwinm2mask(l1,l2: longint): longint;
    begin
       // 1 shl 32 = 1 instead of 0 on x86
      if (l1 <> 0) then
        result :=  longint(cardinal(1) shl (32 - l1) - 1) xor (cardinal(1) shl (31 - l2) - 1)
      else
        result := longint(not(cardinal(1) shl (31 - l2) - 1));
      if (l1 > l2) then
        result := not(result);
    end;


  { First peephole pass. Looks at the instruction p and its successor(s) and
    performs the following rewrites:
      - cmpwi/cmplwi reg,0 + mfcr + rlwinm  -> see cmpi_mfcr_opt
      - mr followed by an instruction that overwrites the mr destination
        -> the mr is folded into that instruction
      - slwi/srwi/rlwinm followed by slwi/srwi/rlwinm on the same register
        -> merged into a single rlwinm (or li/mr) where possible
    p may be replaced by its successor when the instruction it pointed to is
    removed. Returns true if something was changed in a way that warrants
    looking at p again. }
  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
    var
      next1, next2: tai;
      l1, l2, shlcount: longint;
    begin
      result := false;
      case p.typ of
        ait_instruction:
          begin
            case taicpu(p).opcode of
              A_CMPWI,
              A_CMPLWI:
                begin
                  if (taicpu(p).oper[1]^.typ = top_const) and
                     (taicpu(p).oper[1]^.val = 0) and
                     getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     (taicpu(next1).opcode = A_MFCR) and
                     getnextinstruction(next1,next2) and
                     // next2 must be an instruction before it is treated as taicpu
                     (next2.typ = ait_instruction) and
                     (taicpu(next2).opcode = A_RLWINM) and
                     (taicpu(next2).oper[0]^.reg = taicpu(next2).oper[1]^.reg) and
                     (taicpu(next2).oper[0]^.reg = taicpu(next1).oper[0]^.reg) and
                     (taicpu(next2).oper[3]^.val = 31) and
                     (taicpu(next2).oper[4]^.val = 31) and
                     cmpi_mfcr_opt(taicpu(p),taicpu(next1),taicpu(next2)) then
                    result := true;
                end;
{ seems the register allocator doesn't generate superfluous fmr's }
{              A_FMR, }
              A_MR:
                begin
                  if getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     (taicpu(next1).ops >= 1) and
                     { spilling_get_operation_type does not support lmw/stmw }
                     (taicpu(next1).opcode <> A_LMW) and
                     (taicpu(next1).opcode <> A_STMW) and
                     (taicpu(next1).spilling_get_operation_type(0) = operand_write) and
                     // only read the reg field if operand 0 really is a register
                     (taicpu(next1).oper[0]^.typ = top_reg) and
                     (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg) then
                    begin
                      // next1 overwrites the mr destination: let it read the
                      // mr source directly and drop the mr
                      for l1 := 1 to taicpu(next1).ops - 1 do
                        case taicpu(next1).oper[l1]^.typ of
                          top_reg:
                            if taicpu(next1).oper[l1]^.reg = taicpu(p).oper[0]^.reg then
                              taicpu(next1).loadreg(l1,taicpu(p).oper[1]^.reg);
                          top_ref:
                            begin
                              if taicpu(next1).oper[l1]^.ref^.base = taicpu(p).oper[0]^.reg then
                                taicpu(next1).oper[l1]^.ref^.base := taicpu(p).oper[1]^.reg;
                              if taicpu(next1).oper[l1]^.ref^.index = taicpu(p).oper[0]^.reg then
                                taicpu(next1).oper[l1]^.ref^.index := taicpu(p).oper[1]^.reg;
                            end;
                        end;
                      asml.remove(p);
                      p.free;
                      p := next1;
                      result := true;
                    end;
                end;
              A_SLWI:
                begin
                  if getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     ((taicpu(next1).opcode = A_RLWINM) or
                      (taicpu(next1).opcode = A_SLWI) or
                      (taicpu(next1).opcode = A_SRWI)) and
                     (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
                     (taicpu(next1).oper[1]^.reg = taicpu(p).oper[0]^.reg) then
                    begin
                      { convert slwi to rlwinm and see if the rlwinm }
                      { optimization can do something with it        }
                      taicpu(p).opcode := A_RLWINM;
                      taicpu(p).ops := 5;
                      taicpu(p).loadconst(2,taicpu(p).oper[2]^.val);
                      taicpu(p).loadconst(3,0);
                      taicpu(p).loadconst(4,31-taicpu(p).oper[2]^.val);
                      result := true;
                    end;
                end;
              A_SRWI:
                begin
                  if getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     ((taicpu(next1).opcode = A_SLWI) or
                      (taicpu(next1).opcode = A_RLWINM) or
                      (taicpu(next1).opcode = A_SRWI)) and
                     (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
                     (taicpu(next1).oper[1]^.reg = taicpu(p).oper[0]^.reg) then
                    case taicpu(next1).opcode of
                      A_SLWI:
                        begin
                          // srwi a + slwi b -> one rlwinm rotating by (b-a)
                          taicpu(p).opcode := A_RLWINM;
                          taicpu(p).ops := 5;
                          taicpu(p).loadconst(2,taicpu(next1).oper[2]^.val-taicpu(p).oper[2]^.val);
                          if (taicpu(p).oper[2]^.val < 0) then
                            begin
                              // net right shift: the upper (a-b) bits are cleared
                              // and the rotate count is taken modulo 32
                              taicpu(p).loadconst(3,-taicpu(p).oper[2]^.val);
                              taicpu(p).loadconst(4,31-taicpu(next1).oper[2]^.val);
                              inc(taicpu(p).oper[2]^.val,32);
                            end
                          else
                            begin
                              taicpu(p).loadconst(3,0);
                              taicpu(p).loadconst(4,31-taicpu(next1).oper[2]^.val);
                            end;
                          asml.remove(next1);
                          next1.free;
                          result := true;
                        end;
                      A_RLWINM:
                        begin
                          { convert srwi to rlwinm and see if the rlwinm }
                          { optimization can do something with it        }
                          taicpu(p).opcode := A_RLWINM;
                          taicpu(p).ops := 5;
                          taicpu(p).loadconst(3,taicpu(p).oper[2]^.val);
                          taicpu(p).loadconst(4,31);
                          taicpu(p).loadconst(2,(32-taicpu(p).oper[2]^.val) and 31);
                          result := true;
                        end;
                    end;
                end;
              A_RLWINM:
                begin
                  if getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     ((taicpu(next1).opcode = A_RLWINM) or
                      (taicpu(next1).opcode = A_SRWI) or
                      (taicpu(next1).opcode = A_SLWI)) and
                     (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
                     // both source and target of next1 must equal target of p
                     (taicpu(next1).oper[1]^.reg = taicpu(p).oper[0]^.reg) then
                    begin
                      // shlcount = left-rotate count of next1, l2 = mask of next1
                      case taicpu(next1).opcode of
                        A_RLWINM:
                          begin
                            shlcount := taicpu(next1).oper[2]^.val;
                            l2 := rlwinm2mask(taicpu(next1).oper[3]^.val,taicpu(next1).oper[4]^.val);
                          end;
                        A_SLWI:
                          begin
                            shlcount := taicpu(next1).oper[2]^.val;
                            l2 := (-1) shl shlcount;
                          end;
                        A_SRWI:
                          begin
                            shlcount := 32-taicpu(next1).oper[2]^.val;
                            l2 := (-1) shr taicpu(next1).oper[2]^.val;
                          end;
                        else
                          internalerror(2013113008);
                      end;
                      // mask of p as seen after next1's rotation, combined
                      // with the mask of next1
                      l1 := rlwinm2mask((taicpu(p).oper[3]^.val-shlcount) and 31,(taicpu(p).oper[4]^.val-shlcount) and 31);
                      l1 := l1 and l2;
                      case l1 of
                        -1:
                          begin
                            // nothing is masked away: only the rotates add up
                            taicpu(p).oper[2]^.val := (taicpu(p).oper[2]^.val + shlcount) and 31;
                            asml.remove(next1);
                            next1.free;
                            if (taicpu(p).oper[2]^.val = 0) then
                              begin
                                if (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
                                  begin
                                    // rotate by 0 onto itself: p is a no-op
                                    next1 := tai(p.next);
                                    asml.remove(p);
                                    p.free;
                                    p := next1;
                                  end
                                else
                                  begin
                                    // rotate by 0 into another register: the
                                    // copy itself must be kept, as a plain mr
                                    taicpu(p).opcode := A_MR;
                                    taicpu(p).freeop(2);
                                    taicpu(p).freeop(3);
                                    taicpu(p).freeop(4);
                                    taicpu(p).ops := 2;
                                    taicpu(p).opercnt := 2;
                                  end;
                                result := true;
                              end;
                          end;
                        0:
                          begin
                            // masks have no bits in common
                            taicpu(p).opcode := A_LI;
                            taicpu(p).loadconst(1,0);
                            taicpu(p).freeop(2);
                            taicpu(p).freeop(3);
                            taicpu(p).freeop(4);
                            taicpu(p).ops := 2;
                            taicpu(p).opercnt := 2;
                            asml.remove(next1);
                            next1.free;
                            result := true;
                          end
                        else if tcgppc(cg).get_rlwi_const(l1,l1,l2) then
                          begin
                            // combined mask is expressible as one mask begin/end pair
                            taicpu(p).oper[2]^.val := (taicpu(p).oper[2]^.val + shlcount) and 31;
                            taicpu(p).oper[3]^.val := l1;
                            taicpu(p).oper[4]^.val := l2;
                            asml.remove(next1);
                            next1.free;
                            result := true;
                          end;
                      end;
                    end;
                end;
            end;
          end;
      end;
    end;


  const
    { For every opcode: the variant of that opcode which additionally sets
      the condition flags, or a_none if there is none (or if it is not
      usable as a replacement for an explicit compare with zero).
      The entries must stay in tasmop declaration order. }
    modifyflags: array[tasmop] of tasmop =
      (a_none, a_add_, a_add_, a_addo_, a_addo_, a_addc_, a_addc_, a_addco_, a_addco_,
      a_adde_, a_adde_, a_addeo_, a_addeo_, {a_addi could be addic_ if sure doesn't disturb carry} a_none, a_addic_, a_addic_, a_none,
      a_addme_, a_addme_, a_addmeo_, a_addmeo_, a_addze_, a_addze_, a_addzeo_,
      a_addzeo_, a_and_, a_and_, a_andc_, a_andc_, a_andi_, a_andis_, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_cntlzw_, a_cntlzw_, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_divw_, a_divw_, a_divwo_, a_divwo_,
      a_divwu_, a_divwu_, a_divwuo_, a_divwuo_, a_none, a_none, a_none, a_eqv_,
      a_eqv_, a_extsb_, a_extsb_, a_extsh_, a_extsh_, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      { the following eight entries used to read a_mffs, a_mffs_, a_mfmsr,
        a_mfspr, a_mfsr, a_mfsrin, a_mftb, a_mtcrf; none of those is a
        flag-setting replacement for a compare with zero }
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_mulhw_,
      a_mulhw_, a_mulhwu_, a_mulhwu_, a_none, a_mullw_, a_mullw_, a_mullwo_,
      a_mullwo_, a_nand_, a_nand_, a_neg_, a_neg_, a_nego_, a_nego_, a_nor_, a_nor_,
      a_or_, a_or_, a_orc_, a_orc_, a_none, a_none, a_none, a_rlwimi_, a_rlwimi_,
      a_rlwinm_, a_rlwinm_, a_rlwnm_, a_rlwnm_, a_none, a_slw_, a_slw_, a_sraw_, a_sraw_,
      a_srawi_, a_srawi_,a_srw_, a_srw_, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_subf_, a_subf_, a_subfo_,
      a_subfo_, a_subfc_, a_subfc_, a_subfco_, a_subfco_, a_subfe_, a_subfe_,
      a_subfeo_, a_subfeo_, a_none, a_subfme_, a_subfme_, a_subfmeo_, a_subfmeo_,
      a_subfze_, a_subfze_, a_subfzeo_, a_subfzeo_, a_none, a_none, a_none,
      a_none, a_none, a_none, a_xor_, a_xor_, a_none, a_none,
      { simplified mnemonics }
      a_none, a_none, a_subic_, a_subic_, a_sub_, a_sub_, a_subo_, a_subo_,
      a_subc_, a_subc_, a_subco_, a_subco_, a_none, a_none, a_none, a_none,
      a_extlwi_, a_extlwi_, a_extrwi_, a_extrwi_, a_inslwi_, a_inslwi_, a_insrwi_,
      a_insrwi_, a_rotlwi_, a_rotlwi_, a_rotlw_, a_rotlw_, a_slwi_, a_slwi_,
      a_srwi_, a_srwi_, a_clrlwi_, a_clrlwi_, a_clrrwi_, a_clrrwi_, a_clrslwi_,
      a_clrslwi_, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none {move to special purpose reg}, a_none {move from special purpose reg},
      a_none, a_none, a_none, a_none, a_mr_, a_mr_, a_not_, a_not_, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none);


  { Replaces the opcode of p by its flag-setting variant according to the
    modifyflags table. Returns true if such a variant exists (p was
    changed, or already was that variant), false otherwise (p untouched). }
  function changetomodifyflags(p: taicpu): boolean;
    begin
      result := false;
      if (modifyflags[p.opcode] <> a_none) then
        begin
          p.opcode := modifyflags[p.opcode];
          result := true;
        end;
    end;


  { Final peephole pass:
      - turns "rlwinm_ x,y,0,mb,me" into andis_/andi_ when the mask lies
        completely in the upper resp. lower halfword
      - folds "op x,y,... / cmpwi x,0" into the flag-setting variant of op
        and removes the compare
    Returns true if a compare was removed. }
  function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
    var
      next1: tai;
    begin
      result := false;
      case p.typ of
        ait_instruction:
          begin
            case taicpu(p).opcode of
              A_RLWINM_:
                begin
                  // rlwinm_ is cracked on the G5, andi_/andis_ aren't
                  if (taicpu(p).oper[2]^.val = 0) and
                     // a wrap-around mask (mask begin > mask end) has bits in
                     // both halfwords and cannot be a 16 bit immediate
                     (taicpu(p).oper[3]^.val <= taicpu(p).oper[4]^.val) then
                    if (taicpu(p).oper[3]^.val < 16) and
                       (taicpu(p).oper[4]^.val < 16) then
                      begin
                        // mask entirely in the upper halfword
                        taicpu(p).opcode := A_ANDIS_;
                        taicpu(p).oper[2]^.val := word(
                          ((1 shl (16-taicpu(p).oper[3]^.val)) - 1) xor
                          ((1 shl (15-taicpu(p).oper[4]^.val)) - 1));
                        taicpu(p).freeop(3);
                        taicpu(p).freeop(4);
                        taicpu(p).ops := 3;
                        taicpu(p).opercnt := 3;
                      end
                    else if (taicpu(p).oper[3]^.val >= 16) and
                       (taicpu(p).oper[4]^.val >= 16) then
                      begin
                        // mask entirely in the lower halfword
                        taicpu(p).opcode := A_ANDI_;
                        taicpu(p).oper[2]^.val := word(rlwinm2mask(taicpu(p).oper[3]^.val,taicpu(p).oper[4]^.val));
                        taicpu(p).freeop(3);
                        taicpu(p).freeop(4);
                        taicpu(p).ops := 3;
                        taicpu(p).opercnt := 3;
                      end;
                end;
            end;

            // change "integer operation with destination reg" followed by a
            // comparison to zero of that reg, with a variant of that integer
            // operation which sets the flags (if it exists)
            if not(result) and
               (taicpu(p).ops >= 2) and
               (taicpu(p).oper[0]^.typ = top_reg) and
               (taicpu(p).oper[1]^.typ = top_reg) and
               getnextinstruction(p,next1) and
               (next1.typ = ait_instruction) and
               (taicpu(next1).opcode = A_CMPWI) and
               // make sure the result goes to cr0
               (((taicpu(next1).ops = 2) and
                 (taicpu(next1).oper[1]^.val = 0) and
                 (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg)) or
                ((taicpu(next1).ops = 3) and
                 (taicpu(next1).oper[2]^.val = 0) and
                 (taicpu(next1).oper[0]^.typ = top_reg) and
                 (getsupreg(taicpu(next1).oper[0]^.reg) = RS_CR0) and
                 (taicpu(next1).oper[1]^.reg = taicpu(p).oper[0]^.reg))) and
               changetomodifyflags(taicpu(p)) then
              begin
                asml.remove(next1);
                next1.free;
                result := true;
              end;
          end;
      end;
    end;

begin
  casmoptimizer:=TCpuAsmOptimizer;
End.