* a_op_const_reg_reg optimizations

* added some more 64 bit PPC opcodes * removed last mwpascal references * added replacement of division/modulo by constant by multiplications and shifts for 64 bit * general cleanup git-svn-id: trunk@1648 -
2025-08-19 01:09:13 +02:00 · 2005-11-04 22:49:05 +00:00 · 2005-11-04 22:49:05 +00:00 · e4a61f4af1
commit e4a61f4af1
parent 4bd32a686d
9 changed files with 524 additions and 215 deletions
--- a/compiler/powerpc64/cgcpu.pas
+++ b/compiler/powerpc64/cgcpu.pas
@ -157,16 +157,9 @@ type
  end;
 const
-  TOpCG2AsmOpConstLo: array[topcg] of TAsmOp = (A_NONE, A_ADDI, A_ANDI_,
+  TShiftOpCG2AsmOpConst : array[boolean, OP_SAR..OP_SHR] of TAsmOp = (
-    A_DIVWU,
+    (A_SRAWI, A_SLWI, A_SRWI), (A_SRADI, A_SLDI, A_SRDI)
-    A_DIVW, A_MULLW, A_MULLW, A_NONE, A_NONE, A_ORI,
+    );
    A_SRAWI, A_SLWI, A_SRWI, A_SUBI, A_XORI);
  TOpCG2AsmOpConstHi: array[topcg] of TAsmOp = (A_NONE, A_ADDIS, A_ANDIS_,
    A_DIVWU, A_DIVW, A_MULLW, A_MULLW, A_NONE, A_NONE,
    A_ORIS, A_NONE, A_NONE, A_NONE, A_SUBIS, A_XORIS);
  TShiftOpCG2AsmOpConst32 : array[OP_SAR..OP_SHR] of TAsmOp = (A_SRAWI, A_SLWI, A_SRWI);
  TShiftOpCG2AsmOpConst64 : array[OP_SAR..OP_SHR] of TAsmOp = (A_SRADI, A_SLDI, A_SRDI);
  TOpCmp2AsmCond: array[topcmp] of TAsmCondFlag = (C_NONE, C_EQ, C_GT,
    C_LT, C_GE, C_LE, C_NE, C_LE, C_LT, C_GE, C_GT);
@ -248,10 +241,13 @@ begin
              location^.register)
          else
            { load non-integral sized memory location into register. This 
-            memory location be 1-sizeleft byte sized.
+             memory location be 1-sizeleft byte sized.
-            Always assume that this memory area is properly aligned, eg. start
+             Always assume that this memory area is properly aligned, eg. start
-            loading the larger quantities for "odd" quantities first }
+             loading the larger quantities for "odd" quantities first }
            case sizeleft of
              1,2,4,8 :
                a_load_ref_reg(list, int_cgsize(sizeleft), location^.size, tmpref,
                  location^.register); 
              3 : begin
                a_reg_alloc(list, NR_R12); 
                a_load_ref_reg(list, OS_16, location^.size, tmpref, 
@ -259,7 +255,7 @@ begin
                inc(tmpref.offset, tcgsize2size[OS_16]);
                a_load_ref_reg(list, OS_8, location^.size, tmpref,
                  location^.register);
-                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 40));                
+                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 40));
                a_reg_dealloc(list, NR_R12);
              end;
              5 : begin
@ -267,8 +263,8 @@ begin
                a_load_ref_reg(list, OS_32, location^.size, tmpref, NR_R12);
                inc(tmpref.offset, tcgsize2size[OS_32]);
                a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register);
-                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 24));                
+                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 24));
-                a_reg_dealloc(list, NR_R12);              
+                a_reg_dealloc(list, NR_R12);
              end;
              6 : begin
                a_reg_alloc(list, NR_R12);
@ -286,20 +282,16 @@ begin
                a_load_ref_reg(list, OS_16, location^.size, tmpref, NR_R0);
                inc(tmpref.offset, tcgsize2size[OS_16]);
                a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register);
-                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, NR_R0, NR_R12, 16, 16));                
+                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, NR_R0, NR_R12, 16, 16));
-                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R0, 8, 8));                 
+                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R0, 8, 8));
                a_reg_dealloc(list, NR_R0);
                a_reg_dealloc(list, NR_R12);
              end;
-              1,2,4,8 :
+              else
-                a_load_ref_reg(list, int_cgsize(sizeleft), location^.size, tmpref,
+                { still > 8 bytes to load, so load data single register now }
                  location^.register); 
              else 
                a_load_ref_reg(list, location^.size, location^.size, tmpref,
                  location^.register);
            end; 
 //            a_load_ref_reg(list, location^.size, location^.size, tmpref,
 //              location^.register);
        end;
      LOC_REFERENCE:
        begin
@ -368,12 +360,8 @@ begin
    AT_FUNCTION)));
  if (addNOP) then
    list.concat(taicpu.op_none(A_NOP));
-  {
+  { the compiler does not properly set this flag anymore in pass 1, and
-         the compiler does not properly set this flag anymore in pass 1, and
+   for now we only need it after pass 2 (I hope) (JM) }
         for now we only need it after pass 2 (I hope) (JM)
           if not(pi_do_call in current_procinfo.flags) then
             internalerror(2003060703);
  }
  include(current_procinfo.flags, pi_do_call);
 end;
@ -503,9 +491,9 @@ begin
         32 bits should contain -1
        - loading the lower 32 bits resulted in 0 in the upper 32 bits, and the upper
         32 bits should contain 0 }
-      load32bitconstantR0(list, size, hi(a), NR_R0);
+      load32bitconstant(list, size, hi(a), NR_R12);
      { combine both registers }
-      list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, NR_R0, 32, 0));
+      list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, NR_R12, 32, 0));
    end;
  end;
 end;
@ -550,7 +538,7 @@ const
    ((A_LBZ, A_LBZU), (A_LBZX, A_LBZUX)),
    ((A_LHA, A_LHAU), (A_LHAX, A_LHAUX)),
     { there's no (non-indexed) load-word-algebraic with update instruction (lwau), simulate it in code :( }
-    ((A_LWA, A_LWAU), (A_LWAX, A_LWAUX)),
+    ((A_LWA, A_NOP), (A_LWAX, A_LWAUX)),
    ((A_LD, A_LDU), (A_LDX, A_LDUX))
    );
 var
@ -563,12 +551,12 @@ begin
  ref2 := ref;
  fixref(list, ref2, tosize);
  { the caller is expected to have adjusted the reference already
-   in this case                                                  }
+   in this case }
  if (TCGSize2Size[fromsize] >= TCGSize2Size[tosize]) then
    fromsize := tosize;
  op := loadinstr[fromsize, ref2.index <> NR_NO, false];
  { there is no LWAU instruction, simulate using ADDI and LWA }
-  if (op = A_LWAU) then begin
+  if (op = A_NOP) then begin
    list.concat(taicpu.op_reg_reg_const(A_ADDI, reg, reg, ref2.offset));
    ref2.offset := 0;
    op := A_LWA;
@ -605,8 +593,8 @@ var
 begin
  op := movemap[fromsize, tosize];
  case op of
-        A_MR, A_EXTSB, A_EXTSH, A_EXTSW : instr := taicpu.op_reg_reg(op, reg2, reg1);
+    A_MR, A_EXTSB, A_EXTSH, A_EXTSW : instr := taicpu.op_reg_reg(op, reg2, reg1);
-        A_RLDICL : instr := taicpu.op_reg_reg_const_const(A_RLDICL, reg2, reg1, 0, (8-tcgsize2size[fromsize])*8);
+    A_RLDICL : instr := taicpu.op_reg_reg_const_const(A_RLDICL, reg2, reg1, 0, (8-tcgsize2size[fromsize])*8);
  else
    internalerror(2002090901);
  end;
@ -614,8 +602,8 @@ begin
  rg[R_INTREGISTER].add_move_instruction(instr);
 end;
-procedure tcgppc.a_loadfpu_reg_reg(list: taasmoutput; size: tcgsize; reg1, reg2:
+procedure tcgppc.a_loadfpu_reg_reg(list: taasmoutput; size: tcgsize; 
-  tregister);
+  reg1, reg2: tregister);
 var
  instr: taicpu;
 begin
@ -624,8 +612,8 @@ begin
  rg[R_FPUREGISTER].add_move_instruction(instr);
 end;
-procedure tcgppc.a_loadfpu_ref_reg(list: taasmoutput; size: tcgsize; const ref:
+procedure tcgppc.a_loadfpu_ref_reg(list: taasmoutput; size: tcgsize; 
-  treference; reg: tregister);
+  const ref: treference; reg: tregister);
 const
  FpuLoadInstr: array[OS_F32..OS_F64, boolean, boolean] of TAsmOp =
  { indexed? updating?}
@ -654,7 +642,6 @@ end;
 procedure tcgppc.a_loadfpu_reg_ref(list: taasmoutput; size: tcgsize; reg:
  tregister; const ref: treference);
 const
  FpuStoreInstr: array[OS_F32..OS_F64, boolean, boolean] of TAsmOp =
  { indexed? updating? }
@ -688,139 +675,131 @@ end;
 procedure tcgppc.a_op_const_reg_reg(list: taasmoutput; op: TOpCg;
  size: tcgsize; a: aint; src, dst: tregister);
 var
  l1, l2: longint;
  oplo, ophi: tasmop;
  scratchreg: tregister;
  useReg : boolean;
  shiftmask : longint;
-  procedure do_lo_hi;
+  procedure do_lo_hi(loOp, hiOp : TAsmOp);
  begin
    { Optimization for logical ops (excluding AND), trying to do this as efficiently
     as possible by only generating code for the affected halfwords. Note that all
     the instructions handled here must have "X op 0 = X" for every halfword. }
    usereg := false;
-    if (size in [OS_64, OS_S64]) then begin
+    if (aword(a) > high(dword)) then begin
      { ts: use register method for 64 bit consts. Sloooooow }
      usereg := true;
    end else if (size in [OS_32, OS_S32]) then begin
      list.concat(taicpu.op_reg_reg_const(oplo, dst, src, word(a)));
      list.concat(taicpu.op_reg_reg_const(ophi, dst, dst, word(a shr 16)));
    end else begin
-      list.concat(taicpu.op_reg_reg_const(oplo, dst, src, word(a)));
+      if (word(a) <> 0) then begin
        list.concat(taicpu.op_reg_reg_const(loOp, dst, src, word(a)));
        if (word(a shr 16) <> 0) then
          list.concat(taicpu.op_reg_reg_const(hiOp, dst, dst, word(a shr 16)));
      end else if (word(a shr 16) <> 0) then
        list.concat(taicpu.op_reg_reg_const(hiOp, dst, src, word(a shr 16)));
    end;
  end;
  procedure do_lo_hi_and;
  begin
    { optimization logical and with immediate: only use "andi." for 16 bit
     ands, otherwise use register method. Doing this for 32 bit constants
     would not give any advantage to the register method (via useReg := true), 
     requiring a scratch register and three instructions. }
    usereg := false;
    if (aword(a) > high(word)) then
      usereg := true
    else
      list.concat(taicpu.op_reg_reg_const(A_ANDI_, dst, src, word(a)));
  end;
 var
  scratchreg: tregister;
  shift, shiftmask : longint;
 begin
  { subtraction is the same as addition with negative constant }
  if op = OP_SUB then begin
    a_op_const_reg_reg(list, OP_ADD, size, -a, src, dst);
    exit;
  end;
-  ophi := TOpCG2AsmOpConstHi[op];
+  { This case includes some peephole optimizations for the various operations,
-  oplo := TOpCG2AsmOpConstLo[op];
+   (e.g. AND, OR, XOR, ..) - can't this be done at some higher level, 
-  { peephole optimizations for AND, OR, XOR - can't this be done at
+   independent of architecture? }
   some higher level, independent of architecture? }
  if (op in [OP_AND, OP_OR, OP_XOR]) then begin
    if (a = 0) then begin
      if op = OP_AND then
        list.concat(taicpu.op_reg_const(A_LI, dst, 0))
      else
        a_load_reg_reg(list, size, size, src, dst);
      exit;
    end else if (a = -1) then begin
      case op of
        OP_OR:
          list.concat(taicpu.op_reg_const(A_LI, dst, -1));
        OP_XOR:
          list.concat(taicpu.op_reg_reg(A_NOT, dst, src));
        OP_AND:
          a_load_reg_reg(list, size, size, src, dst);
      end;
      exit;
    end;
  { optimization for add }
  end else if (op = OP_ADD) then
    if a = 0 then begin
      a_load_reg_reg(list, size, size, src, dst);
      exit;
    end else if (a >= low(smallint)) and (a <= high(smallint)) then begin
      list.concat(taicpu.op_reg_reg_const(A_ADDI, dst, src, smallint(a)));
      exit;
    end;
-  { otherwise, the instructions we can generate depend on the operation }
+  { assume that we do not need a scratch register for the operation }
  useReg := false;
-  case op of
+  case (op) of
    OP_DIV, OP_IDIV:
      { actually, this method should be never called directly with OP_DIV or
       OP_IDIV, so just provide basic support.
       TODO: move division by constant stuff from nppcmat.pas here }    
      if (a = 0) then
        internalerror(200208103)
-      else if (a = 1) then begin
+      else if (a = 1) then
-        a_load_reg_reg(list, OS_INT, OS_INT, src, dst);
+        a_load_reg_reg(list, size, size, src, dst)
-        exit
+      else
-      end else if false {and ispowerof2(a, l1)} then begin
+        usereg := true; 
        internalerror(200208103);
        case op of
          OP_DIV: begin
            list.concat(taicpu.op_reg_reg_const(A_SRDI, dst, src, l1));
          end;
          OP_IDIV:
            begin
              list.concat(taicpu.op_reg_reg_const(A_SRADI, dst, src, l1));
              list.concat(taicpu.op_reg_reg(A_ADDZE, dst, dst));
            end;
        end;
        exit;
      end else
        usereg := true;
    OP_IMUL, OP_MUL:
-      if (a = 0) then begin
+      { idea: factorize constant multiplicands and use adds/shifts with few factors;
-        list.concat(taicpu.op_reg_const(A_LI, dst, 0));
+       however, even a 64 bit multiply is already quite fast on PPC64 }
-        exit
+      if (a = 0) then
-      end else if (a = -1) then begin
+        a_load_const_reg(list, size, 0, dst)
-        list.concat(taicpu.op_reg_reg(A_NEG, dst, dst));
+      else if (a = -1) then
-      end else if (a = 1) then begin
+        list.concat(taicpu.op_reg_reg(A_NEG, dst, dst))
-        a_load_reg_reg(list, OS_INT, OS_INT, src, dst);
+      else if (a = 1) then
-        exit
+        a_load_reg_reg(list, OS_INT, OS_INT, src, dst)
-      end else if ispowerof2(a, l1) then
+      else if ispowerof2(a, shift) then
-        list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, l1))
+        list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, shift))
      else if (a >= low(smallint)) and (a <= high(smallint)) then
        list.concat(taicpu.op_reg_reg_const(A_MULLI, dst, src,
          smallint(a)))
      else
        usereg := true;
    OP_ADD:
-      {$todo ts:optimize}
+      if (a = 0) then
-      useReg := true;
+        a_load_reg_reg(list, size, size, src, dst)
      else if (a >= low(smallint)) and (a <= high(smallint)) then
        list.concat(taicpu.op_reg_reg_const(A_ADDI, dst, src, smallint(a)))
      else
        useReg := true;
    OP_OR:
-      do_lo_hi;
+      if (a = 0) then
        a_load_reg_reg(list, size, size, src, dst)
      else if (a = -1) then
        a_load_const_reg(list, size, -1, dst)
      else
        do_lo_hi(A_ORI, A_ORIS);
    OP_AND:
-      useReg := true;
+      if (a = 0) then
        a_load_const_reg(list, size, 0, dst)
      else if (a = -1) then
        a_load_reg_reg(list, size, size, src, dst)
      else
        do_lo_hi_and;
    OP_XOR:
-      do_lo_hi;
+      if (a = 0) then
        a_load_reg_reg(list, size, size, src, dst)
      else if (a = -1) then
        list.concat(taicpu.op_reg_reg(A_NOT, dst, src))
      else
        do_lo_hi(A_XORI, A_XORIS);
    OP_SHL, OP_SHR, OP_SAR:
      begin
-        {$note ts: cleanup todo, fix remaining bugs}
+        if (size in [OS_64, OS_S64]) then 
-        if (size in [OS_64, OS_S64]) then begin
+          shift := 6
-          if (a and 63) <> 0 then
+        else
-            list.concat(taicpu.op_reg_reg_const(
+          shift := 5;
-              TShiftOpCG2AsmOpConst64[Op], dst, src, a and 63))
+        
-          else
+        shiftmask := (1 shl shift)-1;
-            a_load_reg_reg(list, size, size, src, dst);
+        if (a and shiftmask) <> 0 then
-          if (a shr 6) <> 0 then
+          list.concat(taicpu.op_reg_reg_const(
-            internalError(68991);
+            TShiftOpCG2AsmOpConst[size in [OS_64, OS_S64], op], dst, src, a and shiftmask))
-        end else begin
+        else
-          if (a and 31) <> 0 then
+          a_load_reg_reg(list, size, size, src, dst);
-            list.concat(taicpu.op_reg_reg_const(
+        if ((a shr shift) <> 0) then
-              TShiftOpCG2AsmOpConst32[Op], dst, src, a and 31))
+          internalError(68991);
          else
            a_load_reg_reg(list, size, size, src, dst);
          if (a shr 5) <> 0 then
            internalError(68991);
        end;
      end
-  else
+    else
-    internalerror(200109091);
+      internalerror(200109091);
  end;
-  { if all else failed, load the constant in a register and then }
+  { if all else failed, load the constant in a register and then
-  { perform the operation                                        }
+   perform the operation }
-  if useReg then begin
+  if (useReg) then begin
    scratchreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
    a_load_const_reg(list, size, a, scratchreg);
    a_op_reg_reg_reg(list, op, size, scratchreg, src, dst);
@ -843,35 +822,29 @@ begin
    OP_NEG, OP_NOT:
      begin
        list.concat(taicpu.op_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src1));
-        if (op = OP_NOT) and
+        if (op = OP_NOT) and not (size in [OS_64, OS_S64]) then
          not (size in [OS_64, OS_S64]) then
          { zero/sign extend result again, fromsize is not important here }
          a_load_reg_reg(list, OS_S64, size, dst, dst)
      end;
-  else
+    else
-  {$NOTE ts:testme}
+      if (size in [OS_64, OS_S64]) then begin
-    if (size in [OS_64, OS_S64]) then begin
+        list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src2,
-      list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src2,
+          src1));
-        src1));
+      end else begin
-    end else begin
+        list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop32[op], dst, src2,
-      list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop32[op], dst, src2,
+          src1));
-        src1));
+      end;
    end;
  end;
 end;
 {*************** compare instructions ****************}
-procedure tcgppc.a_cmp_const_reg_label(list: taasmoutput; size: tcgsize; cmp_op:
+procedure tcgppc.a_cmp_const_reg_label(list: taasmoutput; size: tcgsize; 
-  topcmp; a: aint; reg: tregister;
+  cmp_op: topcmp; a: aint; reg: tregister; l: tasmlabel);
  l: tasmlabel);
 var
  scratch_register: TRegister;
  signed: boolean;
 begin
  { todo: use 32 bit compares? }
  signed := cmp_op in [OC_GT, OC_LT, OC_GTE, OC_LTE];
  { in the following case, we generate more efficient code when }
  { signed is true                                              }
@ -897,13 +870,10 @@ begin
  a_jmp(list, A_BC, TOpCmp2AsmCond[cmp_op], 0, l);
 end;
-procedure tcgppc.a_cmp_reg_reg_label(list: taasmoutput; size: tcgsize; cmp_op:
+procedure tcgppc.a_cmp_reg_reg_label(list: taasmoutput; size: tcgsize; 
-  topcmp;
+  cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);
  reg1, reg2: tregister; l: tasmlabel);
 var
  op: tasmop;
 begin
  if cmp_op in [OC_GT, OC_LT, OC_GTE, OC_LTE] then
    if (size in [OS_64, OS_S64]) then
@ -953,11 +923,9 @@ end;
 procedure tcgppc.g_flags2reg(list: taasmoutput; size: TCgSize; const f:
  TResFlags; reg: TRegister);
 var
  testbit: byte;
  bitvalue: boolean;
 begin
  { get the bit to extract from the conditional register + its requested value (0 or 1) }
  testbit := ((f.cr - RS_CR0) * 4);
@ -1375,7 +1343,7 @@ begin
    list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, 8));
    countreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
    a_load_const_reg(list, OS_32, count, countreg);
-    { explicitely allocate R_0 since it can be used safely here
+    { explicitly allocate F0 since it can be used safely here
      (for holding data that's being copied) }
    a_reg_alloc(list, NR_F0);
    objectlibrary.getjumplabel(lab);
--- a/compiler/powerpc64/cpubase.pas
+++ b/compiler/powerpc64/cpubase.pas
@ -94,7 +94,7 @@ type
    A_SRDI, A_SRADI,
    A_SLDI,
    A_RLDICL,
-    A_DIVDU, A_DIVD, A_MULLD, A_SRAD, A_SLD, A_SRD,
+    A_DIVDU, A_DIVDU_, A_DIVD, A_DIVD_, A_MULLD, A_MULLD_, A_MULHD, A_MULHD_, A_SRAD, A_SLD, A_SRD,
    A_DIVDUO_, A_DIVDO_,
    A_LWA, A_LWAU, A_LWAX, A_LWAUX,
    A_FCFID,
--- a/compiler/powerpc64/cpuinfo.pas
+++ b/compiler/powerpc64/cpuinfo.pas
@ -49,9 +49,7 @@ const
    { the difference to stdcall is only the name mangling }
    pocall_cdecl,
    { the difference to stdcall is only the name mangling }
-    pocall_cppdecl,
+    pocall_cppdecl
    { pass all const records by reference }
    pocall_mwpascal
    ];
  processorsstr: array[tprocessors] of string[10] = ('',
--- a/compiler/powerpc64/cpupara.pas
+++ b/compiler/powerpc64/cpupara.pas
@ -408,10 +408,11 @@ begin
      end;
    end;
  end;
  curintreg := nextintreg;
  curfloatreg := nextfloatreg;
  curmmreg := nextmmreg;
-  cur_stack_offset := stack_offset;
+  cur_stack_offset := stack_offset; 
  result := stack_offset;
 end;
--- a/compiler/powerpc64/cpupi.pas
+++ b/compiler/powerpc64/cpupi.pas
@ -66,7 +66,7 @@ var
 begin
  if not (po_assembler in procdef.procoptions) then begin
    { align the stack properly }
-    ofs := align(maxpushedparasize + LinkageAreaSizeELF, ELF_STACK_ALIGN);
+    ofs := align(maxpushedparasize + LinkageAreaSizeELF, 8);
    { the ABI specification says that it is required to always allocate space for 8 * 8 bytes
      for registers R3-R10 and stack header if there's a stack frame, but GCC doesn't do that,
@ -74,7 +74,6 @@ begin
 //    if (ofs < 112) then begin
 //      ofs := 112;
 //    end;
    tg.setfirsttemp(ofs);
  end else begin
    locals := 0;
--- a/compiler/powerpc64/itcpugas.pas
+++ b/compiler/powerpc64/itcpugas.pas
@ -84,7 +84,7 @@ const
    'srdi', 'sradi',
    'sldi',
    'rldicl',
-    'divdu', 'divd', 'mulld', 'srad', 'sld', 'srd',
+    'divdu', 'divdu.', 'divd', 'divd.', 'mulld', 'mulld.', 'mulhd', 'mulhd.', 'srad', 'sld', 'srd',
    'divduo.', 'divdo.',
    'lwa', '<illegal lwau>', 'lwax', 'lwaux',
    'fcfid',
--- a/compiler/powerpc64/nppcadd.pas
+++ b/compiler/powerpc64/nppcadd.pas
@ -163,7 +163,6 @@ begin
  end
 end;
 // Todo: ts: allow emitting word compares...
 procedure tppcaddnode.emit_compare(unsigned: boolean);
 var
  op: tasmop;
@ -175,8 +174,7 @@ begin
    swapleftright;
  // can we use an immediate, or do we have to load the
  // constant in a register first?
-  if (right.location.loc = LOC_CONSTANT) then
+  if (right.location.loc = LOC_CONSTANT) then begin
  begin
    if (nodetype in [equaln, unequaln]) then
      if (unsigned and
        (aword(right.location.value) > high(word))) or
@ -193,15 +191,13 @@ begin
      (aint(right.location.value) >= low(smallint)) and
      (aint(right.location.value) <= high(smallint))) then
      useconst := true
-    else
+    else begin
    begin
      useconst := false;
      tmpreg := cg.getintregister(exprasmlist, OS_INT);
      cg.a_load_const_reg(exprasmlist, OS_INT,
        right.location.value, tmpreg);
    end
-  end
+  end else
  else
    useconst := false;
  location.loc := LOC_FLAGS;
  location.resflags := getresflags;
@ -215,15 +211,13 @@ begin
  else
    op := A_CMPLD;
-  if (right.location.loc = LOC_CONSTANT) then
+  if (right.location.loc = LOC_CONSTANT) then begin
  begin
    if useconst then
      exprasmlist.concat(taicpu.op_reg_const(op, left.location.register,
        longint(right.location.value)))
    else
      exprasmlist.concat(taicpu.op_reg_reg(op, left.location.register, tmpreg));
-  end
+  end else
  else
    exprasmlist.concat(taicpu.op_reg_reg(op,
      left.location.register, right.location.register));
 end;
@ -237,7 +231,7 @@ var
  cgop: TOpCg;
  cgsize: TCgSize;
  cmpop,
-    isjump: boolean;
+  isjump: boolean;
  otl, ofl: tasmlabel;
 begin
  { calculate the operator which is more difficult }
@ -525,7 +519,6 @@ begin
            cg.a_op_reg_reg(exprasmlist, OP_SHL, OS_64,
              right.location.register, tmpreg);
            if left.location.loc <> LOC_CONSTANT then begin
              cg.a_op_reg_reg_reg(exprasmlist, OP_OR, OS_64, tmpreg,
                left.location.register, location.register)
            end else begin
--- a/compiler/powerpc64/nppcinl.pas
+++ b/compiler/powerpc64/nppcinl.pas
@ -36,6 +36,8 @@ type
    }
    function first_abs_real: tnode; override;
    function first_sqr_real: tnode; override;
    { todo: inline trunc/round/frac?/int }
    procedure second_abs_real; override;
    procedure second_sqr_real; override;
    procedure second_prefetch; override;
--- a/compiler/powerpc64/nppcmat.pas
+++ b/compiler/powerpc64/nppcmat.pas
@ -59,6 +59,176 @@ uses
  cpubase, cpuinfo,
  ncgutil, cgcpu, rgobj;
 { helper functions }
 { getmagic_unsigned32: computes the "magic number" parameters for the
   unsigned 32 bit divisor d, for use when a division by a constant is
   replaced by a multiplication and shifts.

   Input:
     d           - the constant divisor, must be > 0 (asserted below)
   Output:
     magic_m     - resulting magic number (q2 + 1)
     magic_add   - true if, at some step of the search, doubling q2 would no
                   longer fit into 32 bits (q2 >= 2^31 - 1 resp. q2 >= 2^31
                   before the doubling); false otherwise
     magic_shift - resulting shift amount (p - 32)

   How the three outputs are combined into instructions is decided by the
   caller and is not visible in this routine.

   NOTE(review): the computation appears to rely on dword values wrapping
   modulo 2^32 (e.g. "-d", "q1*nc", "2*r1 - nc"), so it presumably needs
   range and overflow checking switched off - confirm. In particular verify
   that "(-d) mod d" is evaluated with the intended modular meaning and not
   on a widened signed value. }
 procedure getmagic_unsigned32(d : dword; out magic_m : dword; out magic_add : boolean; out magic_shift : dword);
 var
    p : longint;
    nc, delta, q1, r1, q2, r2 : dword;
 begin
    assert(d > 0);
    magic_add := false;
    nc := - 1 - (-d) mod d;
    p := 31; { initialize p }
    q1 := $80000000 div nc; { initialize q1 = 2^p/nc }
    r1 := $80000000 - q1*nc; { initialize r1 = rem(2^p,nc) }
    q2 := $7FFFFFFF div d; { initialize q2 = (2^p-1)/d }
    r2 := $7FFFFFFF - q2*d; { initialize r2 = rem((2^p-1),d) }
    { each iteration increases p by one and doubles both quotient/remainder
      pairs accordingly, instead of recomputing the divisions from scratch }
    repeat
        inc(p);
        if (r1 >= (nc - r1)) then begin
            q1 := 2 * q1 + 1; { update q1 }
            r1 := 2*r1 - nc; { update r1 }
        end else begin
            q1 := 2*q1; { update q1 }
            r1 := 2*r1; { update r1 }
        end;
        if ((r2 + 1) >= (d - r2)) then begin
            { q2 is about to become 2*q2 + 1; remember if that exceeds 32 bits }
            if (q2 >= $7FFFFFFF) then
                magic_add := true;
            q2 := 2*q2 + 1; { update q2 }
            r2 := 2*r2 + 1 - d; { update r2 }
        end else begin
            { q2 is about to become 2*q2; remember if that exceeds 32 bits }
            if (q2 >= $80000000) then 
                magic_add := true;
            q2 := 2*q2; { update q2 }
            r2 := 2*r2 + 1; { update r2 }
        end;
        delta := d - 1 - r2;
    { stop as soon as p reaches 64, or q1 > delta, or q1 = delta with r1 <> 0 }
    until not ((p < 64) and ((q1 < delta) or ((q1 = delta) and (r1 = 0))));
    magic_m := q2 + 1; { resulting magic number }
    magic_shift := p - 32; { resulting shift }
 end;
 { getmagic_signed32: computes the "magic number" parameters for the signed
   32 bit divisor d, for use when a division by a constant is replaced by a
   multiplication and shifts.

   Input:
     d       - the constant divisor; must be < -1 or > 1 (asserted below),
               i.e. -1, 0 and 1 have to be handled by the caller
   Output:
     magic_m - resulting magic number (q2 + 1, negated if d is negative)
     magic_s - resulting shift amount (p - 32)

   NOTE(review): the intermediate values are kept in DWord variables and
   "q2 + 1" is stored into a longint, so this presumably relies on range and
   overflow checking being switched off - confirm. }
 procedure getmagic_signed32(d : longint; out magic_m : longint; out magic_s : longint);
 const
    two_31 : DWord = high(longint)+1;
 var
    p : Longint;
    ad, anc, delta, q1, r1, q2, r2, t : DWord;
 begin
    assert((d < -1) or (d > 1));
    ad := abs(d);
    { t = 2^31 for positive d, 2^31 + 1 for negative d (adds the sign bit of d) }
    t := two_31 + (DWord(d) shr 31);
    anc := t - 1 - t mod ad; { absolute value of nc }
    p := 31; { initialize p }
    q1 := two_31 div anc; { initialize q1 = 2^p/abs(nc) }
    r1 := two_31 - q1*anc; { initialize r1 = rem(2^p,abs(nc)) }
    q2 := two_31 div ad; { initialize q2 = 2^p/abs(d) }
    r2 := two_31 - q2*ad; { initialize r2 = rem(2^p,abs(d)) }
    { each iteration increases p by one and doubles both quotient/remainder
      pairs, correcting them when the doubled remainder reaches the divisor }
    repeat 
        inc(p);
        q1 := 2*q1; { update q1 = 2^p/abs(nc) }
        r1 := 2*r1; { update r1 = rem(2^p,abs(nc)) }
        if (r1 >= anc) then begin { must be unsigned comparison }
            inc(q1);
            dec(r1, anc);
        end;
        q2 := 2*q2; { update q2 = 2^p/abs(d) }
        r2 := 2*r2; { update r2 = rem(2^p,abs(d)) }
        if (r2 >= ad) then begin { must be unsigned comparison }
            inc(q2);
            dec(r2, ad);
        end;
        delta := ad - r2;
    { stop as soon as q1 > delta, or q1 = delta with r1 <> 0 }
    until not ((q1 < delta) or ((q1 = delta) and (r1 = 0)));
    magic_m := q2 + 1;
    if (d < 0) then begin
        magic_m := -magic_m; { resulting magic number }
    end;
    magic_s := p - 32; { resulting shift }
 end;
 { helper functions }
 { getmagic_unsigned64: 64 bit counterpart of getmagic_unsigned32. Computes
   the "magic number" parameters for the unsigned 64 bit divisor d, for use
   when a division by a constant is replaced by a multiplication and shifts.

   Input:
     d           - the constant divisor, must be > 0 (asserted below)
   Output:
     magic_m     - resulting magic number (q2 + 1)
     magic_add   - true if, at some step of the search, doubling q2 would no
                   longer fit into 64 bits (q2 >= 2^63 - 1 resp. q2 >= 2^63
                   before the doubling); false otherwise
     magic_shift - resulting shift amount (p - 64)

   How the three outputs are combined into instructions is decided by the
   caller and is not visible in this routine.

   NOTE(review): the computation appears to rely on qword values wrapping
   modulo 2^64 (e.g. "-d", "q1*nc", "2*r1 - nc"), so it presumably needs
   range and overflow checking switched off - confirm. In particular verify
   that "(-d) mod d" is evaluated as an unsigned 64 bit operation and not on
   a value converted to int64. }
 procedure getmagic_unsigned64(d : qword; out magic_m : qword; out magic_add : boolean; out magic_shift : qword);
 const
  two_63 : QWord = $8000000000000000;  
 var
    p : int64;
    nc, delta, q1, r1, q2, r2 : qword;
 begin
  assert(d > 0);
  magic_add := false;
  nc := - 1 - (-d) mod d;
  p := 63; { initialize p }
  q1 := two_63 div nc; { initialize q1 = 2^p/nc }
  r1 := two_63 - q1*nc; { initialize r1 = rem(2^p,nc) }
  q2 := (two_63-1) div d; { initialize q2 = (2^p-1)/d }
  r2 := (two_63-1) - q2*d; { initialize r2 = rem((2^p-1),d) }
  { each iteration increases p by one and doubles both quotient/remainder
    pairs accordingly, instead of recomputing the divisions from scratch }
  repeat
    inc(p);
    if (r1 >= (nc - r1)) then begin
      q1 := 2 * q1 + 1; { update q1 }
      r1 := 2*r1 - nc; { update r1 }
    end else begin
      q1 := 2*q1; { update q1 }
      r1 := 2*r1; { update r1 }
    end;
    if ((r2 + 1) >= (d - r2)) then begin
      { q2 is about to become 2*q2 + 1; remember if that exceeds 64 bits }
      if (q2 >= (two_63-1)) then
        magic_add := true;
      q2 := 2*q2 + 1; { update q2 }
      r2 := 2*r2 + 1 - d; { update r2 }
    end else begin
      { q2 is about to become 2*q2; remember if that exceeds 64 bits }
      if (q2 >= two_63) then 
        magic_add := true;
      q2 := 2*q2; { update q2 }
      r2 := 2*r2 + 1; { update r2 }
    end;
    delta := d - 1 - r2;
  { stop as soon as p reaches 128, or q1 > delta, or q1 = delta with r1 <> 0 }
  until not ((p < 128) and ((q1 < delta) or ((q1 = delta) and (r1 = 0))));
  magic_m := q2 + 1; { resulting magic number }
  magic_shift := p - 64; { resulting shift }
 end;
 { getmagic_signed64: 64 bit counterpart of getmagic_signed32. Computes the
   "magic number" parameters for the signed 64 bit divisor d, for use when a
   division by a constant is replaced by a multiplication and shifts.

   Input:
     d       - the constant divisor; must be < -1 or > 1 (asserted below),
               i.e. -1, 0 and 1 have to be handled by the caller
   Output:
     magic_m - resulting magic number (q2 + 1, negated if d is negative)
     magic_s - resulting shift amount (p - 64)

   NOTE(review): the intermediate values are kept in QWord variables and
   "q2 + 1" is stored into an int64, so this presumably relies on range and
   overflow checking being switched off - confirm. }
 procedure getmagic_signed64(d : int64; out magic_m : int64; out magic_s : int64);
 const
  two_63 : QWord = $8000000000000000;  
 var
  p : int64;
  ad, anc, delta, q1, r1, q2, r2, t : QWord;
 begin
  assert((d < -1) or (d > 1));
  ad := abs(d);
  { t = 2^63 for positive d, 2^63 + 1 for negative d (adds the sign bit of d) }
  t := two_63 + (QWord(d) shr 63);
  anc := t - 1 - t mod ad; { absolute value of nc }
  p := 63; { initialize p }
  q1 := two_63 div anc; { initialize q1 = 2^p/abs(nc) }
  r1 := two_63 - q1*anc; { initialize r1 = rem(2^p,abs(nc)) }
  q2 := two_63 div ad; { initialize q2 = 2^p/abs(d) }
  r2 := two_63 - q2*ad; { initialize r2 = rem(2^p,abs(d)) }
  { each iteration increases p by one and doubles both quotient/remainder
    pairs, correcting them when the doubled remainder reaches the divisor }
  repeat 
    inc(p);
    q1 := 2*q1; { update q1 = 2^p/abs(nc) }
    r1 := 2*r1; { update r1 = rem(2^p,abs(nc)) }
    if (r1 >= anc) then begin { must be unsigned comparison }
      inc(q1);
      dec(r1, anc);
    end;
    q2 := 2*q2; { update q2 = 2^p/abs(d) }
    r2 := 2*r2; { update r2 = rem(2^p,abs(d)) }
    if (r2 >= ad) then begin { must be unsigned comparison }
      inc(q2);
      dec(r2, ad);
    end;
    delta := ad - r2;
  { stop as soon as q1 > delta, or q1 = delta with r1 <> 0 }
  until not ((q1 < delta) or ((q1 = delta) and (r1 = 0)));
  magic_m := q2 + 1;
  if (d < 0) then begin
    magic_m := -magic_m; { resulting magic number }
  end;
  magic_s := p - 64; { resulting shift }
 end;
 {*****************************************************************************
                             TPPCMODDIVNODE
 *****************************************************************************}
@ -70,6 +240,200 @@ begin
    include(current_procinfo.flags, pi_do_call);
 end;
 { Second pass (code generation) for an integer div/mod node.

   Evaluates both operands, forces the left one (numerator) into a register,
   emits the divide instruction selected from divops and, for a mod node,
   derives the remainder as numerator - quotient*divider. For a divisor that
   is not an ordinal constant a comparison of the divisor against zero is
   emitted before the divide, and a conditional branch around a call to
   FPC_DIVBYZERO after it. For signed operands cg.g_overflowcheck is invoked
   at the end.

   NOTE: the nested procedures genOrdConstNodeDiv/genOrdConstNodeMod implement
   divide-free sequences for constant divisors, but the code that would call
   them is commented out in the main body, so "done" is always false there
   and the generic divide path is always taken. }
 procedure tppcmoddivnode.pass_2;
 const         { signed   overflow }
  { divide opcode, indexed by [divisor type is signed, overflow checking on] }
  divops: array[boolean, boolean] of tasmop =
    ((A_DIVDU,A_DIVDU_),(A_DIVD,A_DIVDO_));
  { branch condition used to skip the FPC_DIVBYZERO call: taken when the
    comparison result in CR7 is "not equal" }
  zerocond: tasmcond = (dirhint: DH_Plus; simple: true; cond:C_NE; cr: RS_CR7);
 var
  power  : longint;
  op  : tasmop;
  numerator, divider,
  resultreg  : tregister;
  size       : TCgSize;
  hl : tasmlabel;
  done: boolean;
  { Emits code for "numerator div <ordinal constant>" into resultreg without
    using a divide instruction; sets done to true on return.
    A zero constant is an internal error, 1 copies the numerator, -1 negates
    it, a power of two becomes a shift, and every other value is handled by a
    multiply-high with a precomputed magic constant plus correction steps. }
  procedure genOrdConstNodeDiv;
  const
    { negate opcode, indexed by "overflow checking on" }
    negops : array[boolean] of tasmop = (A_NEG, A_NEGO);
  var
    magic, shift : int64;
    u_magic, u_shift : qword;
    u_add : boolean;
    divreg : tregister;
  begin
    if (tordconstnode(right).value = 0) then begin
      internalerror(2005061701);
    end else if (tordconstnode(right).value = 1) then begin
      cg.a_load_reg_reg(exprasmlist, OS_INT, OS_INT, numerator, resultreg);
    end else if (tordconstnode(right).value = -1) then begin
      { note: only in the signed case possible..., may overflow }
      exprasmlist.concat(taicpu.op_reg_reg(negops[cs_check_overflow in aktlocalswitches], resultreg, numerator));
    end else if (ispowerof2(tordconstnode(right).value, power)) then begin
      if (is_signed(right.resulttype.def)) then begin
        { From "The PowerPC Compiler Writer's Guide", pg. 52ff          }
        { arithmetic shift right followed by ADDZE on the shifted value }
        cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, power,
        numerator, resultreg);
        exprasmlist.concat(taicpu.op_reg_reg(A_ADDZE, resultreg, resultreg));
      end else begin
        cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, power, numerator, resultreg)
      end;
    end else begin
      { replace division by multiplication, both implementations }
      { from "The PowerPC Compiler Writer's Guide" pg. 53ff      }
      { divreg first holds the magic constant and is later reused as scratch }
      divreg := cg.getintregister(exprasmlist, OS_INT);
      if (is_signed(right.resulttype.def)) then begin
        getmagic_signed64(tordconstnode(right).value, magic, shift);
        { load magic value }
        cg.a_load_const_reg(exprasmlist, OS_INT, magic, divreg);
        { multiply }
        exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULHD, resultreg, numerator, divreg));
        { add/subtract numerator }
        { only needed when the sign of the magic value differs from the sign
          of the divisor }
        if (tordconstnode(right).value > 0) and (magic < 0) then begin
          cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, numerator, resultreg, resultreg);
        end else if (tordconstnode(right).value < 0) and (magic > 0) then begin
          cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, numerator, resultreg, resultreg);
        end;
        { shift shift places to the right (arithmetic) }
        cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, shift, resultreg, resultreg);                     
        { extract and add sign bit }
        { the sign bit is taken from the numerator for a non-negative divisor
          and from the intermediate result for a negative divisor }
        if (tordconstnode(right).value >= 0) then begin
          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 63, numerator, divreg);
        end else begin
          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 63, resultreg, divreg);
        end;                     
        cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, resultreg, divreg, resultreg);
      end else begin
        getmagic_unsigned64(tordconstnode(right).value, u_magic, u_add, u_shift);
        { load magic in divreg }
        cg.a_load_const_reg(exprasmlist, OS_INT, u_magic, divreg);
        exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULHDU, resultreg, numerator, divreg));
        if (u_add) then begin
          { "add" variant: sub, shift right by one, add, then shift right by
            u_shift-1, using divreg as the intermediate }
          cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, divreg);
          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT,  1, divreg, divreg);
          cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, divreg, resultreg, divreg);
          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, u_shift-1, divreg, resultreg);
        end else begin
          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, u_shift, resultreg, resultreg);
        end;
      end;
    end;
    done := true;
  end;
  { Emits code for "numerator mod <ordinal constant>" into resultreg.
    A zero constant is an internal error, +/-1 loads zero, a power of two is
    handled with mask operations, and every other value is computed from the
    quotient produced by genOrdConstNodeDiv via a multiply and a subtract. }
  procedure genOrdConstNodeMod;
  var
    modreg, maskreg, tempreg : tregister;
  begin
    if (tordconstnode(right).value = 0) then begin
      internalerror(2005061702);
    end else if (abs(tordconstnode(right).value) = 1) then begin
      { x mod +/-1 is always zero }
      cg.a_load_const_reg(exprasmlist, OS_INT, 0, resultreg);
    end else if (ispowerof2(tordconstnode(right).value, power)) then begin
      if (is_signed(right.resulttype.def)) then begin
        tempreg := cg.getintregister(exprasmlist, OS_INT);
        maskreg := cg.getintregister(exprasmlist, OS_INT);
        modreg := cg.getintregister(exprasmlist, OS_INT);
        { modreg := abs(divisor)-1, i.e. the mask of the low bits }
        cg.a_load_const_reg(exprasmlist, OS_INT, abs(tordconstnode(right).value)-1, modreg);
        { maskreg := numerator shifted right arithmetically by 63 }
        cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, 63, numerator, maskreg);
        { tempreg := numerator and low-bit mask }
        cg.a_op_reg_reg_reg(exprasmlist, OP_AND, OS_INT, numerator, modreg, tempreg);
        { the remaining instructions combine tempreg with bits from maskreg;
          NOTE(review): presumably this sign-extends a non-zero remainder of a
          negative numerator - confirm against the instruction semantics }
        exprasmlist.concat(taicpu.op_reg_reg_reg(A_ANDC, maskreg, maskreg, modreg));
        exprasmlist.concat(taicpu.op_reg_reg_const(A_SUBFIC, modreg, tempreg, 0));
        exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUBFE, modreg, modreg, modreg));
        cg.a_op_reg_reg_reg(exprasmlist, OP_AND, OS_INT, modreg, maskreg, maskreg);
        cg.a_op_reg_reg_reg(exprasmlist, OP_OR, OS_INT, maskreg, tempreg, resultreg);
      end else begin
        cg.a_op_const_reg_reg(exprasmlist, OP_AND, OS_INT, tordconstnode(right).value-1, numerator, resultreg);
      end;
    end else begin
      { quotient into resultreg, then multiply it by the divisor and combine
        with the numerator using a subtract }
      genOrdConstNodeDiv();
      cg.a_op_const_reg_reg(exprasmlist, OP_MUL, OS_INT, tordconstnode(right).value, resultreg, resultreg);
      cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, resultreg);
    end;
  end;
 begin
  secondpass(left);
  secondpass(right);
  { copied again below, after the left operand has been forced into a
    register }
  location_copy(location,left.location);
  { put numerator in register }
  size:=def_cgsize(left.resulttype.def);
  location_force_reg(exprasmlist,left.location,
    size,true);
  location_copy(location,left.location);
  numerator := location.register;
  resultreg := location.register;
  if (location.loc = LOC_CREGISTER) then begin
    { allocate a separate register for the result;
      NOTE(review): presumably so that the LOC_CREGISTER source is not
      overwritten - confirm }
    location.loc := LOC_REGISTER;
    location.register := cg.getintregister(exprasmlist,size);
    resultreg := location.register;
  end else if (nodetype = modn) or (right.nodetype = ordconstn) then begin
    { for a modulus op, and for const nodes we need the result register
     to be an extra register }
    resultreg := cg.getintregister(exprasmlist,size);
  end;
  done := false;
  { the constant divisor fast path below is currently disabled }
 (*
  if (right.nodetype = ordconstn) then begin
    if (nodetype = divn) then
      genOrdConstNodeDiv
    else
      genOrdConstNodeMod;
    done := true;
  end;
 *)
  if (not done) then begin
    { load divider in a register if necessary }
    location_force_reg(exprasmlist,right.location,
      def_cgsize(right.resulttype.def),true);
    { compare a non-constant divider with zero, result in CR7; it is consumed
      by the conditional branch emitted after the divide }
    if (right.nodetype <> ordconstn) then
      exprasmlist.concat(taicpu.op_reg_reg_const(A_CMPDI, NR_CR7,
        right.location.register, 0))
    else begin
      if (tordconstnode(right).value = 0) then 
        internalerror(2005100301);
    end;
    divider := right.location.register;
    { needs overflow checking, (-maxlongint-1) div (-1) overflows! }
    op := divops[is_signed(right.resulttype.def),
      cs_check_overflow in aktlocalswitches];
    exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator,
      divider));
    if (nodetype = modn) then begin
      { remainder: multiply the quotient by the divider, then combine with
        the numerator using A_SUB; the result goes to location.register }
      exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD,resultreg,
          divider,resultreg));
      exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUB,location.register,
        numerator,resultreg));
      resultreg := location.register;
    end;
  end;
  { set result location }
  location.loc:=LOC_REGISTER;
  location.register:=resultreg;
  { for a non-constant divider: branch over the FPC_DIVBYZERO call when the
    earlier comparison found the divider to be non-zero }
  if right.nodetype <> ordconstn then begin
    objectlibrary.getjumplabel(hl);
    exprasmlist.concat(taicpu.op_cond_sym(A_BC,zerocond,hl));
    cg.a_call_name(exprasmlist,'FPC_DIVBYZERO');
    cg.a_label(exprasmlist,hl);
  end;
  { unsigned division/modulo can only overflow in case of division by zero
   (but checking this overflow flag is more convoluted than performing a  
   simple comparison with 0)                                             }
  if is_signed(right.resulttype.def) then
    cg.g_overflowcheck(exprasmlist,location,resulttype.def);
 end;
 (*
 procedure tppcmoddivnode.pass_2;
 const
  // ts: todo, use 32 bit operations if possible (much faster!)
@ -130,9 +494,7 @@ begin
    end else begin
      cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, power, numerator, resultreg);
    end;
-  end
+  end else begin
  else
  begin
    { load divider in a register if necessary }
    location_force_reg(exprasmlist, right.location,
      def_cgsize(right.resulttype.def), true);
@ -150,8 +512,7 @@ begin
    exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator,
      divider));
-    if (nodetype = modn) then
+    if (nodetype = modn) then begin
    begin
 {$NOTE ts:testme}
      exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD, resultreg,
        divider, resultreg));
@ -163,8 +524,7 @@ begin
  { set result location }
  location.loc := LOC_REGISTER;
  location.register := resultreg;
-  if right.nodetype <> ordconstn then
+  if (right.nodetype <> ordconstn) then begin
  begin
    objectlibrary.getjumplabel(hl);
    exprasmlist.concat(taicpu.op_cond_sym(A_BC, zerocond, hl));
    cg.a_call_name(exprasmlist, 'FPC_DIVBYZERO');
@ -172,7 +532,7 @@ begin
  end;
  cg.g_overflowcheck(exprasmlist, location, resulttype.def);
 end;
-
+*)
 {*****************************************************************************
                             TPPCSHLSHRNODE
 *****************************************************************************}
@ -181,8 +541,8 @@ end;
 procedure tppcshlshrnode.pass_2;
 var
-  resultreg, hregister1, hregister2,
+  resultreg, hregister1, hregister2 : tregister;
-    hreg64hi, hreg64lo: tregister;
+  
  op: topcg;
  asmop1, asmop2: tasmop;
  shiftval: aint;
@ -199,7 +559,7 @@ begin
  hregister1 := location.register;
  if (location.loc = LOC_CREGISTER) then begin
    location.loc := LOC_REGISTER;
-    resultreg := cg.getintregister(exprasmlist, OS_64);
+    resultreg := cg.getintregister(exprasmlist, OS_INT);
    location.register := resultreg;
  end;
@ -257,17 +617,14 @@ begin
        end;
      LOC_REFERENCE, LOC_CREFERENCE:
        begin
-          if (left.resulttype.def.deftype = floatdef) then
+          if (left.resulttype.def.deftype = floatdef) then begin
          begin
            src1 := cg.getfpuregister(exprasmlist,
              def_cgsize(left.resulttype.def));
            location.register := src1;
            cg.a_loadfpu_ref_reg(exprasmlist,
              def_cgsize(left.resulttype.def),
              left.location.reference, src1);
-          end
+          end else begin
          else
          begin
            src1 := cg.getintregister(exprasmlist, OS_64);
            location.register := src1;
            cg.a_load_ref_reg(exprasmlist, OS_64, OS_64,
@ -276,28 +633,19 @@ begin
        end;
    end;
    { choose appropriate operand }
-    if left.resulttype.def.deftype <> floatdef then
+    if left.resulttype.def.deftype <> floatdef then begin
    begin
      if not (cs_check_overflow in aktlocalswitches) then
        op := A_NEG
      else
        op := A_NEGO_;
      location.loc := LOC_REGISTER;
-    end
+    end else begin
    else
    begin
      op := A_FNEG;
      location.loc := LOC_FPUREGISTER;
    end;
    { emit operation }
    exprasmlist.concat(taicpu.op_reg_reg(op, location.register, src1));
  end;
  { Here was a problem...     }
  { Operand to be negated always     }
  { seems to be converted to signed  }
  { 32-bit before doing neg!!     }
  { So this is useless...     }
  { that's not true: -2^31 gives an overflow error if it is negated (FK) }
  cg.g_overflowcheck(exprasmlist, location, resulttype.def);
 end;