From e4a61f4af17030886a7922c1953717c36e1039b9 Mon Sep 17 00:00:00 2001 From: tom_at_work Date: Fri, 4 Nov 2005 22:49:05 +0000 Subject: [PATCH] * a_op_const_reg_reg optimizations * added some more 64 bit PPC opcodes * removed last mwpascal references * added replacement of division/modulo by constant by multiplications and shifts for 64 bit * general cleanup git-svn-id: trunk@1648 - --- compiler/powerpc64/cgcpu.pas | 302 +++++++++++------------- compiler/powerpc64/cpubase.pas | 2 +- compiler/powerpc64/cpuinfo.pas | 4 +- compiler/powerpc64/cpupara.pas | 3 +- compiler/powerpc64/cpupi.pas | 3 +- compiler/powerpc64/itcpugas.pas | 2 +- compiler/powerpc64/nppcadd.pas | 19 +- compiler/powerpc64/nppcinl.pas | 2 + compiler/powerpc64/nppcmat.pas | 402 +++++++++++++++++++++++++++++--- 9 files changed, 524 insertions(+), 215 deletions(-) diff --git a/compiler/powerpc64/cgcpu.pas b/compiler/powerpc64/cgcpu.pas index e165234f51..24d40df15d 100644 --- a/compiler/powerpc64/cgcpu.pas +++ b/compiler/powerpc64/cgcpu.pas @@ -157,16 +157,9 @@ type end; const - TOpCG2AsmOpConstLo: array[topcg] of TAsmOp = (A_NONE, A_ADDI, A_ANDI_, - A_DIVWU, - A_DIVW, A_MULLW, A_MULLW, A_NONE, A_NONE, A_ORI, - A_SRAWI, A_SLWI, A_SRWI, A_SUBI, A_XORI); - TOpCG2AsmOpConstHi: array[topcg] of TAsmOp = (A_NONE, A_ADDIS, A_ANDIS_, - A_DIVWU, A_DIVW, A_MULLW, A_MULLW, A_NONE, A_NONE, - A_ORIS, A_NONE, A_NONE, A_NONE, A_SUBIS, A_XORIS); - - TShiftOpCG2AsmOpConst32 : array[OP_SAR..OP_SHR] of TAsmOp = (A_SRAWI, A_SLWI, A_SRWI); - TShiftOpCG2AsmOpConst64 : array[OP_SAR..OP_SHR] of TAsmOp = (A_SRADI, A_SLDI, A_SRDI); + TShiftOpCG2AsmOpConst : array[boolean, OP_SAR..OP_SHR] of TAsmOp = ( + (A_SRAWI, A_SLWI, A_SRWI), (A_SRADI, A_SLDI, A_SRDI) + ); TOpCmp2AsmCond: array[topcmp] of TAsmCondFlag = (C_NONE, C_EQ, C_GT, C_LT, C_GE, C_LE, C_NE, C_LE, C_LT, C_GE, C_GT); @@ -248,10 +241,13 @@ begin location^.register) else { load non-integral sized memory location into register. This - memory location be 1-sizeleft byte sized. - Always assume that this memory area is properly aligned, eg. start - loading the larger quantities for "odd" quantities first } + memory location be 1-sizeleft byte sized. + Always assume that this memory area is properly aligned, eg. start + loading the larger quantities for "odd" quantities first } case sizeleft of + 1,2,4,8 : + a_load_ref_reg(list, int_cgsize(sizeleft), location^.size, tmpref, + location^.register); 3 : begin a_reg_alloc(list, NR_R12); a_load_ref_reg(list, OS_16, location^.size, tmpref, @@ -259,7 +255,7 @@ begin inc(tmpref.offset, tcgsize2size[OS_16]); a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register); - list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 40)); + list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 40)); a_reg_dealloc(list, NR_R12); end; 5 : begin @@ -267,8 +263,8 @@ begin a_load_ref_reg(list, OS_32, location^.size, tmpref, NR_R12); inc(tmpref.offset, tcgsize2size[OS_32]); a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register); - list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 24)); - a_reg_dealloc(list, NR_R12); + list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 24)); + a_reg_dealloc(list, NR_R12); end; 6 : begin a_reg_alloc(list, NR_R12); @@ -286,20 +282,16 @@ begin a_load_ref_reg(list, OS_16, location^.size, tmpref, NR_R0); inc(tmpref.offset, tcgsize2size[OS_16]); a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register); - list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, NR_R0, NR_R12, 16, 16)); - list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R0, 8, 8)); + list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, NR_R0, NR_R12, 16, 16)); + list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R0, 8, 8)); a_reg_dealloc(list, NR_R0); a_reg_dealloc(list, NR_R12); end; - 1,2,4,8 : - a_load_ref_reg(list, int_cgsize(sizeleft), location^.size, tmpref, - location^.register); - else + else + { still > 8 bytes to load, so load data single register now } a_load_ref_reg(list, location^.size, location^.size, tmpref, location^.register); end; -// a_load_ref_reg(list, location^.size, location^.size, tmpref, -// location^.register); end; LOC_REFERENCE: begin @@ -368,12 +360,8 @@ begin AT_FUNCTION))); if (addNOP) then list.concat(taicpu.op_none(A_NOP)); - { - the compiler does not properly set this flag anymore in pass 1, and - for now we only need it after pass 2 (I hope) (JM) - if not(pi_do_call in current_procinfo.flags) then - internalerror(2003060703); - } + { the compiler does not properly set this flag anymore in pass 1, and + for now we only need it after pass 2 (I hope) (JM) } include(current_procinfo.flags, pi_do_call); end; @@ -503,9 +491,9 @@ begin 32 bits should contain -1 - loading the lower 32 bits resulted in 0 in the upper 32 bits, and the upper 32 bits should contain 0 } - load32bitconstantR0(list, size, hi(a), NR_R0); + load32bitconstant(list, size, hi(a), NR_R12); { combine both registers } - list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, NR_R0, 32, 0)); + list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, NR_R12, 32, 0)); end; end; end; @@ -550,7 +538,7 @@ const ((A_LBZ, A_LBZU), (A_LBZX, A_LBZUX)), ((A_LHA, A_LHAU), (A_LHAX, A_LHAUX)), { there's no load-word-arithmetic-indexed with update, simulate it in code :( } - ((A_LWA, A_LWAU), (A_LWAX, A_LWAUX)), + ((A_LWA, A_NOP), (A_LWAX, A_LWAUX)), ((A_LD, A_LDU), (A_LDX, A_LDUX)) ); var @@ -563,12 +551,12 @@ begin ref2 := ref; fixref(list, ref2, tosize); { the caller is expected to have adjusted the reference already - in this case } + in this case } if (TCGSize2Size[fromsize] >= TCGSize2Size[tosize]) then fromsize := tosize; op := loadinstr[fromsize, ref2.index <> NR_NO, false]; { there is no LWAU instruction, simulate using ADDI and LWA } - if (op = A_LWAU) then begin + if (op = A_NOP) then begin list.concat(taicpu.op_reg_reg_const(A_ADDI, reg, reg, ref2.offset)); ref2.offset := 0; op := A_LWA; @@ -605,8 +593,8 @@ var begin op := movemap[fromsize, tosize]; case op of - A_MR, A_EXTSB, A_EXTSH, A_EXTSW : instr := taicpu.op_reg_reg(op, reg2, reg1); - A_RLDICL : instr := taicpu.op_reg_reg_const_const(A_RLDICL, reg2, reg1, 0, (8-tcgsize2size[fromsize])*8); + A_MR, A_EXTSB, A_EXTSH, A_EXTSW : instr := taicpu.op_reg_reg(op, reg2, reg1); + A_RLDICL : instr := taicpu.op_reg_reg_const_const(A_RLDICL, reg2, reg1, 0, (8-tcgsize2size[fromsize])*8); else internalerror(2002090901); end; @@ -614,8 +602,8 @@ begin rg[R_INTREGISTER].add_move_instruction(instr); end; -procedure tcgppc.a_loadfpu_reg_reg(list: taasmoutput; size: tcgsize; reg1, reg2: - tregister); +procedure tcgppc.a_loadfpu_reg_reg(list: taasmoutput; size: tcgsize; + reg1, reg2: tregister); var instr: taicpu; begin @@ -624,8 +612,8 @@ begin rg[R_FPUREGISTER].add_move_instruction(instr); end; -procedure tcgppc.a_loadfpu_ref_reg(list: taasmoutput; size: tcgsize; const ref: - treference; reg: tregister); +procedure tcgppc.a_loadfpu_ref_reg(list: taasmoutput; size: tcgsize; + const ref: treference; reg: tregister); const FpuLoadInstr: array[OS_F32..OS_F64, boolean, boolean] of TAsmOp = { indexed? updating?} @@ -654,7 +642,6 @@ end; procedure tcgppc.a_loadfpu_reg_ref(list: taasmoutput; size: tcgsize; reg: tregister; const ref: treference); - const FpuStoreInstr: array[OS_F32..OS_F64, boolean, boolean] of TAsmOp = { indexed? updating? } @@ -688,139 +675,131 @@ end; procedure tcgppc.a_op_const_reg_reg(list: taasmoutput; op: TOpCg; size: tcgsize; a: aint; src, dst: tregister); var - l1, l2: longint; - oplo, ophi: tasmop; - scratchreg: tregister; useReg : boolean; - shiftmask : longint; - procedure do_lo_hi; + procedure do_lo_hi(loOp, hiOp : TAsmOp); begin + { Optimization for logical ops (excluding AND), trying to do this as efficiently + as possible by only generating code for the affected halfwords. Note that all + the instructions handled here must have "X op 0 = X" for every halfword. } usereg := false; - if (size in [OS_64, OS_S64]) then begin - { ts: use register method for 64 bit consts. Sloooooow } + if (aword(a) > high(dword)) then begin usereg := true; - end else if (size in [OS_32, OS_S32]) then begin - list.concat(taicpu.op_reg_reg_const(oplo, dst, src, word(a))); - list.concat(taicpu.op_reg_reg_const(ophi, dst, dst, word(a shr 16))); end else begin - list.concat(taicpu.op_reg_reg_const(oplo, dst, src, word(a))); + if (word(a) <> 0) then begin + list.concat(taicpu.op_reg_reg_const(loOp, dst, src, word(a))); + if (word(a shr 16) <> 0) then + list.concat(taicpu.op_reg_reg_const(hiOp, dst, dst, word(a shr 16))); + end else if (word(a shr 16) <> 0) then + list.concat(taicpu.op_reg_reg_const(hiOp, dst, src, word(a shr 16))); end; end; + procedure do_lo_hi_and; + begin + { optimization logical and with immediate: only use "andi." for 16 bit + ands, otherwise use register method. Doing this for 32 bit constants + would not give any advantage to the register method (via useReg := true), + requiring a scratch register and three instructions. } + usereg := false; + if (aword(a) > high(word)) then + usereg := true + else + list.concat(taicpu.op_reg_reg_const(A_ANDI_, dst, src, word(a))); + end; +var + scratchreg: tregister; + shift, shiftmask : longint; + begin + { subtraction is the same as addition with negative constant } if op = OP_SUB then begin a_op_const_reg_reg(list, OP_ADD, size, -a, src, dst); exit; end; - ophi := TOpCG2AsmOpConstHi[op]; - oplo := TOpCG2AsmOpConstLo[op]; - { peephole optimizations for AND, OR, XOR - can't this be done at - some higher level, independent of architecture? } - if (op in [OP_AND, OP_OR, OP_XOR]) then begin - if (a = 0) then begin - if op = OP_AND then - list.concat(taicpu.op_reg_const(A_LI, dst, 0)) - else - a_load_reg_reg(list, size, size, src, dst); - exit; - end else if (a = -1) then begin - case op of - OP_OR: - list.concat(taicpu.op_reg_const(A_LI, dst, -1)); - OP_XOR: - list.concat(taicpu.op_reg_reg(A_NOT, dst, src)); - OP_AND: - a_load_reg_reg(list, size, size, src, dst); - end; - exit; - end; - { optimization for add } - end else if (op = OP_ADD) then - if a = 0 then begin - a_load_reg_reg(list, size, size, src, dst); - exit; - end else if (a >= low(smallint)) and (a <= high(smallint)) then begin - list.concat(taicpu.op_reg_reg_const(A_ADDI, dst, src, smallint(a))); - exit; - end; + { This case includes some peephole optimizations for the various operations, + (e.g. AND, OR, XOR, ..) - can't this be done at some higher level, + independent of architecture? } - { otherwise, the instructions we can generate depend on the operation } + { assume that we do not need a scratch register for the operation } useReg := false; - case op of + case (op) of OP_DIV, OP_IDIV: + { actually, this method should be never called directly with OP_DIV or + OP_IDIV, so just provide basic support. + TODO: move division by constant stuff from nppcmat.pas here } if (a = 0) then internalerror(200208103) - else if (a = 1) then begin - a_load_reg_reg(list, OS_INT, OS_INT, src, dst); - exit - end else if false {and ispowerof2(a, l1)} then begin - internalerror(200208103); - case op of - OP_DIV: begin - list.concat(taicpu.op_reg_reg_const(A_SRDI, dst, src, l1)); - end; - OP_IDIV: - begin - list.concat(taicpu.op_reg_reg_const(A_SRADI, dst, src, l1)); - list.concat(taicpu.op_reg_reg(A_ADDZE, dst, dst)); - end; - end; - exit; - end else - usereg := true; + else if (a = 1) then + a_load_reg_reg(list, size, size, src, dst) + else + usereg := true; OP_IMUL, OP_MUL: - if (a = 0) then begin - list.concat(taicpu.op_reg_const(A_LI, dst, 0)); - exit - end else if (a = -1) then begin - list.concat(taicpu.op_reg_reg(A_NEG, dst, dst)); - end else if (a = 1) then begin - a_load_reg_reg(list, OS_INT, OS_INT, src, dst); - exit - end else if ispowerof2(a, l1) then - list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, l1)) + { idea: factorize constant multiplicands and use adds/shifts with few factors; + however, even a 64 bit multiply is already quite fast on PPC64 } + if (a = 0) then + a_load_const_reg(list, size, 0, dst) + else if (a = -1) then + list.concat(taicpu.op_reg_reg(A_NEG, dst, dst)) + else if (a = 1) then + a_load_reg_reg(list, OS_INT, OS_INT, src, dst) + else if ispowerof2(a, shift) then + list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, shift)) else if (a >= low(smallint)) and (a <= high(smallint)) then list.concat(taicpu.op_reg_reg_const(A_MULLI, dst, src, smallint(a))) else usereg := true; OP_ADD: - {$todo ts:optimize} - useReg := true; + if (a = 0) then + a_load_reg_reg(list, size, size, src, dst) + else if (a >= low(smallint)) and (a <= high(smallint)) then + list.concat(taicpu.op_reg_reg_const(A_ADDI, dst, src, smallint(a))) + else + useReg := true; OP_OR: - do_lo_hi; + if (a = 0) then + a_load_reg_reg(list, size, size, src, dst) + else if (a = -1) then + a_load_const_reg(list, size, -1, dst) + else + do_lo_hi(A_ORI, A_ORIS); OP_AND: - useReg := true; + if (a = 0) then + a_load_const_reg(list, size, 0, dst) + else if (a = -1) then + a_load_reg_reg(list, size, size, src, dst) + else + do_lo_hi_and; OP_XOR: - do_lo_hi; + if (a = 0) then + a_load_reg_reg(list, size, size, src, dst) + else if (a = -1) then + list.concat(taicpu.op_reg_reg(A_NOT, dst, src)) + else + do_lo_hi(A_XORI, A_XORIS); OP_SHL, OP_SHR, OP_SAR: begin - {$note ts: cleanup todo, fix remaining bugs} - if (size in [OS_64, OS_S64]) then begin - if (a and 63) <> 0 then - list.concat(taicpu.op_reg_reg_const( - TShiftOpCG2AsmOpConst64[Op], dst, src, a and 63)) - else - a_load_reg_reg(list, size, size, src, dst); - if (a shr 6) <> 0 then - internalError(68991); - end else begin - if (a and 31) <> 0 then - list.concat(taicpu.op_reg_reg_const( - TShiftOpCG2AsmOpConst32[Op], dst, src, a and 31)) - else - a_load_reg_reg(list, size, size, src, dst); - if (a shr 5) <> 0 then - internalError(68991); - end; + if (size in [OS_64, OS_S64]) then + shift := 6 + else + shift := 5; + + shiftmask := (1 shl shift)-1; + if (a and shiftmask) <> 0 then + list.concat(taicpu.op_reg_reg_const( + TShiftOpCG2AsmOpConst[size in [OS_64, OS_S64], op], dst, src, a and shiftmask)) + else + a_load_reg_reg(list, size, size, src, dst); + if ((a shr shift) <> 0) then + internalError(68991); end - else - internalerror(200109091); + else + internalerror(200109091); end; - { if all else failed, load the constant in a register and then } - { perform the operation } - if useReg then begin + { if all else failed, load the constant in a register and then + perform the operation } + if (useReg) then begin scratchreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE); a_load_const_reg(list, size, a, scratchreg); a_op_reg_reg_reg(list, op, size, scratchreg, src, dst); @@ -843,35 +822,29 @@ begin OP_NEG, OP_NOT: begin list.concat(taicpu.op_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src1)); - if (op = OP_NOT) and - not (size in [OS_64, OS_S64]) then + if (op = OP_NOT) and not (size in [OS_64, OS_S64]) then { zero/sign extend result again, fromsize is not important here } a_load_reg_reg(list, OS_S64, size, dst, dst) end; - else - {$NOTE ts:testme} - if (size in [OS_64, OS_S64]) then begin - list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src2, - src1)); - end else begin - list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop32[op], dst, src2, - src1)); - end; + else + if (size in [OS_64, OS_S64]) then begin + list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src2, + src1)); + end else begin + list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop32[op], dst, src2, + src1)); + end; end; end; {*************** compare instructructions ****************} -procedure tcgppc.a_cmp_const_reg_label(list: taasmoutput; size: tcgsize; cmp_op: - topcmp; a: aint; reg: tregister; - l: tasmlabel); - +procedure tcgppc.a_cmp_const_reg_label(list: taasmoutput; size: tcgsize; + cmp_op: topcmp; a: aint; reg: tregister; l: tasmlabel); var scratch_register: TRegister; signed: boolean; - begin - { todo: use 32 bit compares? } signed := cmp_op in [OC_GT, OC_LT, OC_GTE, OC_LTE]; { in the following case, we generate more efficient code when } { signed is true } @@ -897,13 +870,10 @@ begin a_jmp(list, A_BC, TOpCmp2AsmCond[cmp_op], 0, l); end; -procedure tcgppc.a_cmp_reg_reg_label(list: taasmoutput; size: tcgsize; cmp_op: - topcmp; - reg1, reg2: tregister; l: tasmlabel); - +procedure tcgppc.a_cmp_reg_reg_label(list: taasmoutput; size: tcgsize; + cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel); var op: tasmop; - begin if cmp_op in [OC_GT, OC_LT, OC_GTE, OC_LTE] then if (size in [OS_64, OS_S64]) then @@ -953,11 +923,9 @@ end; procedure tcgppc.g_flags2reg(list: taasmoutput; size: TCgSize; const f: TResFlags; reg: TRegister); - var testbit: byte; bitvalue: boolean; - begin { get the bit to extract from the conditional register + its requested value (0 or 1) } testbit := ((f.cr - RS_CR0) * 4); @@ -1375,7 +1343,7 @@ begin list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, 8)); countreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE); a_load_const_reg(list, OS_32, count, countreg); - { explicitely allocate R_0 since it can be used safely here + { explicitely allocate F0 since it can be used safely here (for holding date that's being copied) } a_reg_alloc(list, NR_F0); objectlibrary.getjumplabel(lab); diff --git a/compiler/powerpc64/cpubase.pas b/compiler/powerpc64/cpubase.pas index a35962ec3d..839c2e6931 100644 --- a/compiler/powerpc64/cpubase.pas +++ b/compiler/powerpc64/cpubase.pas @@ -94,7 +94,7 @@ type A_SRDI, A_SRADI, A_SLDI, A_RLDICL, - A_DIVDU, A_DIVD, A_MULLD, A_SRAD, A_SLD, A_SRD, + A_DIVDU, A_DIVDU_, A_DIVD, A_DIVD_, A_MULLD, A_MULLD_, A_MULHD, A_MULHD_, A_SRAD, A_SLD, A_SRD, A_DIVDUO_, A_DIVDO_, A_LWA, A_LWAU, A_LWAX, A_LWAUX, A_FCFID, diff --git a/compiler/powerpc64/cpuinfo.pas b/compiler/powerpc64/cpuinfo.pas index 2c429825ea..7a59e11314 100644 --- a/compiler/powerpc64/cpuinfo.pas +++ b/compiler/powerpc64/cpuinfo.pas @@ -49,9 +49,7 @@ const { the difference to stdcall is only the name mangling } pocall_cdecl, { the difference to stdcall is only the name mangling } - pocall_cppdecl, - { pass all const records by reference } - pocall_mwpascal + pocall_cppdecl ]; processorsstr: array[tprocessors] of string[10] = ('', diff --git a/compiler/powerpc64/cpupara.pas b/compiler/powerpc64/cpupara.pas index b417a511fd..38ed3d9927 100644 --- a/compiler/powerpc64/cpupara.pas +++ b/compiler/powerpc64/cpupara.pas @@ -408,10 +408,11 @@ begin end; end; end; + curintreg := nextintreg; curfloatreg := nextfloatreg; curmmreg := nextmmreg; - cur_stack_offset := stack_offset; + cur_stack_offset := stack_offset; result := stack_offset; end; diff --git a/compiler/powerpc64/cpupi.pas b/compiler/powerpc64/cpupi.pas index 0c813ad15f..f733950e09 100644 --- a/compiler/powerpc64/cpupi.pas +++ b/compiler/powerpc64/cpupi.pas @@ -66,7 +66,7 @@ var begin if not (po_assembler in procdef.procoptions) then begin { align the stack properly } - ofs := align(maxpushedparasize + LinkageAreaSizeELF, ELF_STACK_ALIGN); + ofs := align(maxpushedparasize + LinkageAreaSizeELF, 8); { the ABI specification says that it is required to always allocate space for 8 * 8 bytes for registers R3-R10 and stack header if there's a stack frame, but GCC doesn't do that, @@ -74,7 +74,6 @@ begin // if (ofs < 112) then begin // ofs := 112; // end; - tg.setfirsttemp(ofs); end else begin locals := 0; diff --git a/compiler/powerpc64/itcpugas.pas b/compiler/powerpc64/itcpugas.pas index ce4ce4b5f3..810224d845 100644 --- a/compiler/powerpc64/itcpugas.pas +++ b/compiler/powerpc64/itcpugas.pas @@ -84,7 +84,7 @@ const 'srdi', 'sradi', 'sldi', 'rldicl', - 'divdu', 'divd', 'mulld', 'srad', 'sld', 'srd', + 'divdu', 'divdu.', 'divd', 'divd.', 'mulld', 'mulld.', 'mulhd', 'mulhd.', 'srad', 'sld', 'srd', 'divduo.', 'divdo.', 'lwa', '', 'lwax', 'lwaux', 'fcfid', diff --git a/compiler/powerpc64/nppcadd.pas b/compiler/powerpc64/nppcadd.pas index ecae422fe6..67212ac3bb 100644 --- a/compiler/powerpc64/nppcadd.pas +++ b/compiler/powerpc64/nppcadd.pas @@ -163,7 +163,6 @@ begin end end; -// Todo: ts: allow emiting word compares... procedure tppcaddnode.emit_compare(unsigned: boolean); var op: tasmop; @@ -175,8 +174,7 @@ begin swapleftright; // can we use an immediate, or do we have to load the // constant in a register first? - if (right.location.loc = LOC_CONSTANT) then - begin + if (right.location.loc = LOC_CONSTANT) then begin if (nodetype in [equaln, unequaln]) then if (unsigned and (aword(right.location.value) > high(word))) or @@ -193,15 +191,13 @@ begin (aint(right.location.value) >= low(smallint)) and (aint(right.location.value) <= high(smallint))) then useconst := true - else - begin + else begin useconst := false; tmpreg := cg.getintregister(exprasmlist, OS_INT); cg.a_load_const_reg(exprasmlist, OS_INT, right.location.value, tmpreg); end - end - else + end else useconst := false; location.loc := LOC_FLAGS; location.resflags := getresflags; @@ -215,15 +211,13 @@ begin else op := A_CMPLD; - if (right.location.loc = LOC_CONSTANT) then - begin + if (right.location.loc = LOC_CONSTANT) then begin if useconst then exprasmlist.concat(taicpu.op_reg_const(op, left.location.register, longint(right.location.value))) else exprasmlist.concat(taicpu.op_reg_reg(op, left.location.register, tmpreg)); - end - else + end else exprasmlist.concat(taicpu.op_reg_reg(op, left.location.register, right.location.register)); end; @@ -237,7 +231,7 @@ var cgop: TOpCg; cgsize: TCgSize; cmpop, - isjump: boolean; + isjump: boolean; otl, ofl: tasmlabel; begin { calculate the operator which is more difficult } @@ -525,7 +519,6 @@ begin cg.a_op_reg_reg(exprasmlist, OP_SHL, OS_64, right.location.register, tmpreg); if left.location.loc <> LOC_CONSTANT then begin - cg.a_op_reg_reg_reg(exprasmlist, OP_OR, OS_64, tmpreg, left.location.register, location.register) end else begin diff --git a/compiler/powerpc64/nppcinl.pas b/compiler/powerpc64/nppcinl.pas index 5d758db142..938d58cfa1 100644 --- a/compiler/powerpc64/nppcinl.pas +++ b/compiler/powerpc64/nppcinl.pas @@ -36,6 +36,8 @@ type } function first_abs_real: tnode; override; function first_sqr_real: tnode; override; + { todo: inline trunc/round/frac?/int } + procedure second_abs_real; override; procedure second_sqr_real; override; procedure second_prefetch; override; diff --git a/compiler/powerpc64/nppcmat.pas b/compiler/powerpc64/nppcmat.pas index b5809bbbba..b7c372ef01 100644 --- a/compiler/powerpc64/nppcmat.pas +++ b/compiler/powerpc64/nppcmat.pas @@ -59,6 +59,176 @@ uses cpubase, cpuinfo, ncgutil, cgcpu, rgobj; +{ helper functions } +procedure getmagic_unsigned32(d : dword; out magic_m : dword; out magic_add : boolean; out magic_shift : dword); +var + p : longint; + nc, delta, q1, r1, q2, r2 : dword; + +begin + assert(d > 0); + + magic_add := false; + nc := - 1 - (-d) mod d; + p := 31; { initialize p } + q1 := $80000000 div nc; { initialize q1 = 2p/nc } + r1 := $80000000 - q1*nc; { initialize r1 = rem(2p,nc) } + q2 := $7FFFFFFF div d; { initialize q2 = (2p-1)/d } + r2 := $7FFFFFFF - q2*d; { initialize r2 = rem((2p-1),d) } + repeat + inc(p); + if (r1 >= (nc - r1)) then begin + q1 := 2 * q1 + 1; { update q1 } + r1 := 2*r1 - nc; { update r1 } + end else begin + q1 := 2*q1; { update q1 } + r1 := 2*r1; { update r1 } + end; + if ((r2 + 1) >= (d - r2)) then begin + if (q2 >= $7FFFFFFF) then + magic_add := true; + q2 := 2*q2 + 1; { update q2 } + r2 := 2*r2 + 1 - d; { update r2 } + end else begin + if (q2 >= $80000000) then + magic_add := true; + q2 := 2*q2; { update q2 } + r2 := 2*r2 + 1; { update r2 } + end; + delta := d - 1 - r2; + until not ((p < 64) and ((q1 < delta) or ((q1 = delta) and (r1 = 0)))); + magic_m := q2 + 1; { resulting magic number } + magic_shift := p - 32; { resulting shift } +end; + +procedure getmagic_signed32(d : longint; out magic_m : longint; out magic_s : longint); +const + two_31 : DWord = high(longint)+1; +var + p : Longint; + ad, anc, delta, q1, r1, q2, r2, t : DWord; + +begin + assert((d < -1) or (d > 1)); + + ad := abs(d); + t := two_31 + (DWord(d) shr 31); + anc := t - 1 - t mod ad; { absolute value of nc } + p := 31; { initialize p } + q1 := two_31 div anc; { initialize q1 = 2p/abs(nc) } + r1 := two_31 - q1*anc; { initialize r1 = rem(2p,abs(nc)) } + q2 := two_31 div ad; { initialize q2 = 2p/abs(d) } + r2 := two_31 - q2*ad; { initialize r2 = rem(2p,abs(d)) } + repeat + inc(p); + q1 := 2*q1; { update q1 = 2p/abs(nc) } + r1 := 2*r1; { update r1 = rem(2p/abs(nc)) } + if (r1 >= anc) then begin { must be unsigned comparison } + inc(q1); + dec(r1, anc); + end; + q2 := 2*q2; { update q2 = 2p/abs(d) } + r2 := 2*r2; { update r2 = rem(2p/abs(d)) } + if (r2 >= ad) then begin { must be unsigned comparison } + inc(q2); + dec(r2, ad); + end; + delta := ad - r2; + until not ((q1 < delta) or ((q1 = delta) and (r1 = 0))); + magic_m := q2 + 1; + if (d < 0) then begin + magic_m := -magic_m; { resulting magic number } + end; + magic_s := p - 32; { resulting shift } +end; + +{ helper functions } +procedure getmagic_unsigned64(d : qword; out magic_m : qword; out magic_add : boolean; out magic_shift : qword); +const + two_63 : QWord = $8000000000000000; +var + p : int64; + nc, delta, q1, r1, q2, r2 : qword; + +begin + assert(d > 0); + + magic_add := false; + nc := - 1 - (-d) mod d; + p := 63; { initialize p } + q1 := two_63 div nc; { initialize q1 = 2p/nc } + r1 := two_63 - q1*nc; { initialize r1 = rem(2p,nc) } + q2 := (two_63-1) div d; { initialize q2 = (2p-1)/d } + r2 := (two_63-1) - q2*d; { initialize r2 = rem((2p-1),d) } + repeat + inc(p); + if (r1 >= (nc - r1)) then begin + q1 := 2 * q1 + 1; { update q1 } + r1 := 2*r1 - nc; { update r1 } + end else begin + q1 := 2*q1; { update q1 } + r1 := 2*r1; { update r1 } + end; + if ((r2 + 1) >= (d - r2)) then begin + if (q2 >= (two_63-1)) then + magic_add := true; + q2 := 2*q2 + 1; { update q2 } + r2 := 2*r2 + 1 - d; { update r2 } + end else begin + if (q2 >= two_63) then + magic_add := true; + q2 := 2*q2; { update q2 } + r2 := 2*r2 + 1; { update r2 } + end; + delta := d - 1 - r2; + until not ((p < 128) and ((q1 < delta) or ((q1 = delta) and (r1 = 0)))); + magic_m := q2 + 1; { resulting magic number } + magic_shift := p - 64; { resulting shift } +end; + +procedure getmagic_signed64(d : int64; out magic_m : int64; out magic_s : int64); +const + two_63 : QWord = $8000000000000000; +var + p : int64; + ad, anc, delta, q1, r1, q2, r2, t : QWord; + +begin + assert((d < -1) or (d > 1)); + + ad := abs(d); + t := two_63 + (QWord(d) shr 63); + anc := t - 1 - t mod ad; { absolute value of nc } + p := 63; { initialize p } + q1 := two_63 div anc; { initialize q1 = 2p/abs(nc) } + r1 := two_63 - q1*anc; { initialize r1 = rem(2p,abs(nc)) } + q2 := two_63 div ad; { initialize q2 = 2p/abs(d) } + r2 := two_63 - q2*ad; { initialize r2 = rem(2p,abs(d)) } + repeat + inc(p); + q1 := 2*q1; { update q1 = 2p/abs(nc) } + r1 := 2*r1; { update r1 = rem(2p/abs(nc)) } + if (r1 >= anc) then begin { must be unsigned comparison } + inc(q1); + dec(r1, anc); + end; + q2 := 2*q2; { update q2 = 2p/abs(d) } + r2 := 2*r2; { update r2 = rem(2p/abs(d)) } + if (r2 >= ad) then begin { must be unsigned comparison } + inc(q2); + dec(r2, ad); + end; + delta := ad - r2; + until not ((q1 < delta) or ((q1 = delta) and (r1 = 0))); + magic_m := q2 + 1; + if (d < 0) then begin + magic_m := -magic_m; { resulting magic number } + end; + magic_s := p - 64; { resulting shift } +end; + + + {***************************************************************************** TPPCMODDIVNODE *****************************************************************************} @@ -70,6 +240,200 @@ begin include(current_procinfo.flags, pi_do_call); end; +procedure tppcmoddivnode.pass_2; +const { signed overflow } + divops: array[boolean, boolean] of tasmop = + ((A_DIVDU,A_DIVDU_),(A_DIVD,A_DIVDO_)); + zerocond: tasmcond = (dirhint: DH_Plus; simple: true; cond:C_NE; cr: RS_CR7); +var + power : longint; + op : tasmop; + numerator, divider, + resultreg : tregister; + size : TCgSize; + hl : tasmlabel; + done: boolean; + + procedure genOrdConstNodeDiv; + const + negops : array[boolean] of tasmop = (A_NEG, A_NEGO); + var + magic, shift : int64; + u_magic, u_shift : qword; + u_add : boolean; + + divreg : tregister; + begin + if (tordconstnode(right).value = 0) then begin + internalerror(2005061701); + end else if (tordconstnode(right).value = 1) then begin + cg.a_load_reg_reg(exprasmlist, OS_INT, OS_INT, numerator, resultreg); + end else if (tordconstnode(right).value = -1) then begin + { note: only in the signed case possible..., may overflow } + exprasmlist.concat(taicpu.op_reg_reg(negops[cs_check_overflow in aktlocalswitches], resultreg, numerator)); + end else if (ispowerof2(tordconstnode(right).value, power)) then begin + if (is_signed(right.resulttype.def)) then begin + { From "The PowerPC Compiler Writer's Guide", pg. 52ff } + cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, power, + numerator, resultreg); + exprasmlist.concat(taicpu.op_reg_reg(A_ADDZE, resultreg, resultreg)); + end else begin + cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, power, numerator, resultreg) + end; + end else begin + { replace division by multiplication, both implementations } + { from "The PowerPC Compiler Writer's Guide" pg. 53ff } + divreg := cg.getintregister(exprasmlist, OS_INT); + if (is_signed(right.resulttype.def)) then begin + getmagic_signed64(tordconstnode(right).value, magic, shift); + { load magic value } + cg.a_load_const_reg(exprasmlist, OS_INT, magic, divreg); + { multiply } + exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULHD, resultreg, numerator, divreg)); + { add/subtract numerator } + if (tordconstnode(right).value > 0) and (magic < 0) then begin + cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, numerator, resultreg, resultreg); + end else if (tordconstnode(right).value < 0) and (magic > 0) then begin + cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, numerator, resultreg, resultreg); + end; + { shift shift places to the right (arithmetic) } + cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, shift, resultreg, resultreg); + { extract and add sign bit } + if (tordconstnode(right).value >= 0) then begin + cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 63, numerator, divreg); + end else begin + cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 63, resultreg, divreg); + end; + cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, resultreg, divreg, resultreg); + end else begin + getmagic_unsigned64(tordconstnode(right).value, u_magic, u_add, u_shift); + { load magic in divreg } + cg.a_load_const_reg(exprasmlist, OS_INT, u_magic, divreg); + exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULHDU, resultreg, numerator, divreg)); + if (u_add) then begin + cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, divreg); + cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 1, divreg, divreg); + cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, divreg, resultreg, divreg); + cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, u_shift-1, divreg, resultreg); + end else begin + cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, u_shift, resultreg, resultreg); + end; + end; + end; + done := true; + end; + + procedure genOrdConstNodeMod; + var + modreg, maskreg, tempreg : tregister; + begin + if (tordconstnode(right).value = 0) then begin + internalerror(2005061702); + end else if (abs(tordconstnode(right).value) = 1) then begin + { x mod +/-1 is always zero } + cg.a_load_const_reg(exprasmlist, OS_INT, 0, resultreg); + end else if (ispowerof2(tordconstnode(right).value, power)) then begin + if (is_signed(right.resulttype.def)) then begin + tempreg := cg.getintregister(exprasmlist, OS_INT); + maskreg := cg.getintregister(exprasmlist, OS_INT); + modreg := cg.getintregister(exprasmlist, OS_INT); + + cg.a_load_const_reg(exprasmlist, OS_INT, abs(tordconstnode(right).value)-1, modreg); + cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, 63, numerator, maskreg); + cg.a_op_reg_reg_reg(exprasmlist, OP_AND, OS_INT, numerator, modreg, tempreg); + + exprasmlist.concat(taicpu.op_reg_reg_reg(A_ANDC, maskreg, maskreg, modreg)); + exprasmlist.concat(taicpu.op_reg_reg_const(A_SUBFIC, modreg, tempreg, 0)); + exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUBFE, modreg, modreg, modreg)); + cg.a_op_reg_reg_reg(exprasmlist, OP_AND, OS_INT, modreg, maskreg, maskreg); + cg.a_op_reg_reg_reg(exprasmlist, OP_OR, OS_INT, maskreg, tempreg, resultreg); + end else begin + cg.a_op_const_reg_reg(exprasmlist, OP_AND, OS_INT, tordconstnode(right).value-1, numerator, resultreg); + end; + end else begin + genOrdConstNodeDiv(); + cg.a_op_const_reg_reg(exprasmlist, OP_MUL, OS_INT, tordconstnode(right).value, resultreg, resultreg); + cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, resultreg); + end; + end; + + +begin + secondpass(left); + secondpass(right); + location_copy(location,left.location); + + { put numerator in register } + size:=def_cgsize(left.resulttype.def); + location_force_reg(exprasmlist,left.location, + size,true); + location_copy(location,left.location); + numerator := location.register; + resultreg := location.register; + if (location.loc = LOC_CREGISTER) then begin + location.loc := LOC_REGISTER; + location.register := cg.getintregister(exprasmlist,size); + resultreg := location.register; + end else if (nodetype = modn) or (right.nodetype = ordconstn) then begin + { for a modulus op, and for const nodes we need the result register + to be an extra register } + resultreg := cg.getintregister(exprasmlist,size); + end; + done := false; +(* + if (right.nodetype = ordconstn) then begin + if (nodetype = divn) then + genOrdConstNodeDiv + else + genOrdConstNodeMod; + done := true; + end; +*) + + if (not done) then begin + { load divider in a register if necessary } + location_force_reg(exprasmlist,right.location, + def_cgsize(right.resulttype.def),true); + if (right.nodetype <> ordconstn) then + exprasmlist.concat(taicpu.op_reg_reg_const(A_CMPDI, NR_CR7, + right.location.register, 0)) + else begin + if (tordconstnode(right).value = 0) then + internalerror(2005100301); + end; + divider := right.location.register; + + { needs overflow checking, (-maxlongint-1) div (-1) overflows! } + op := divops[is_signed(right.resulttype.def), + cs_check_overflow in aktlocalswitches]; + exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator, + divider)); + + if (nodetype = modn) then begin + exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD,resultreg, + divider,resultreg)); + exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUB,location.register, + numerator,resultreg)); + resultreg := location.register; + end; + end; + { set result location } + location.loc:=LOC_REGISTER; + location.register:=resultreg; + if right.nodetype <> ordconstn then begin + objectlibrary.getjumplabel(hl); + exprasmlist.concat(taicpu.op_cond_sym(A_BC,zerocond,hl)); + cg.a_call_name(exprasmlist,'FPC_DIVBYZERO'); + cg.a_label(exprasmlist,hl); + end; + { unsigned division/module can only overflow in case of division by zero + (but checking this overflow flag is more convoluted than performing a + simple comparison with 0) } + if is_signed(right.resulttype.def) then + cg.g_overflowcheck(exprasmlist,location,resulttype.def); +end; + +(* procedure tppcmoddivnode.pass_2; const // ts: todo, use 32 bit operations if possible (much faster!) @@ -130,9 +494,7 @@ begin end else begin cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, power, numerator, resultreg); end; - end - else - begin + end else begin { load divider in a register if necessary } location_force_reg(exprasmlist, right.location, def_cgsize(right.resulttype.def), true); @@ -150,8 +512,7 @@ begin exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator, divider)); - if (nodetype = modn) then - begin + if (nodetype = modn) then begin {$NOTE ts:testme} exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD, resultreg, divider, resultreg)); @@ -163,8 +524,7 @@ begin { set result location } location.loc := LOC_REGISTER; location.register := resultreg; - if right.nodetype <> ordconstn then - begin + if (right.nodetype <> ordconstn) then begin objectlibrary.getjumplabel(hl); exprasmlist.concat(taicpu.op_cond_sym(A_BC, zerocond, hl)); cg.a_call_name(exprasmlist, 'FPC_DIVBYZERO'); @@ -172,7 +532,7 @@ begin end; cg.g_overflowcheck(exprasmlist, location, resulttype.def); end; - +*) {***************************************************************************** TPPCSHLRSHRNODE *****************************************************************************} @@ -181,8 +541,8 @@ end; procedure tppcshlshrnode.pass_2; var - resultreg, hregister1, hregister2, - hreg64hi, hreg64lo: tregister; + resultreg, hregister1, hregister2 : tregister; + op: topcg; asmop1, asmop2: tasmop; shiftval: aint; @@ -199,7 +559,7 @@ begin hregister1 := location.register; if (location.loc = LOC_CREGISTER) then begin location.loc := LOC_REGISTER; - resultreg := cg.getintregister(exprasmlist, OS_64); + resultreg := cg.getintregister(exprasmlist, OS_INT); location.register := resultreg; end; @@ -257,17 +617,14 @@ begin end; LOC_REFERENCE, LOC_CREFERENCE: begin - if (left.resulttype.def.deftype = floatdef) then - begin + if (left.resulttype.def.deftype = floatdef) then begin src1 := cg.getfpuregister(exprasmlist, def_cgsize(left.resulttype.def)); location.register := src1; cg.a_loadfpu_ref_reg(exprasmlist, def_cgsize(left.resulttype.def), left.location.reference, src1); - end - else - begin + end else begin src1 := cg.getintregister(exprasmlist, OS_64); location.register := src1; cg.a_load_ref_reg(exprasmlist, OS_64, OS_64, @@ -276,28 +633,19 @@ begin end; end; { choose appropriate operand } - if left.resulttype.def.deftype <> floatdef then - begin + if left.resulttype.def.deftype <> floatdef then begin if not (cs_check_overflow in aktlocalswitches) then op := A_NEG else op := A_NEGO_; location.loc := LOC_REGISTER; - end - else - begin + end else begin op := A_FNEG; location.loc := LOC_FPUREGISTER; end; { emit operation } exprasmlist.concat(taicpu.op_reg_reg(op, location.register, src1)); end; - { Here was a problem... } - { Operand to be negated always } - { seems to be converted to signed } - { 32-bit before doing neg!! } - { So this is useless... } - { that's not true: -2^31 gives an overflow error if it is negated (FK) } cg.g_overflowcheck(exprasmlist, location, resulttype.def); end;