* a_op_const_reg_reg optimizations

* added some more 64 bit PPC opcodes
* removed last mwpascal references
* added replacement of division/modulo by constant by multiplications and shifts for 64 bit
* general cleanup

git-svn-id: trunk@1648 -
This commit is contained in:
tom_at_work 2005-11-04 22:49:05 +00:00
parent 4bd32a686d
commit e4a61f4af1
9 changed files with 524 additions and 215 deletions

View File

@ -157,16 +157,9 @@ type
end;
const
TOpCG2AsmOpConstLo: array[topcg] of TAsmOp = (A_NONE, A_ADDI, A_ANDI_,
A_DIVWU,
A_DIVW, A_MULLW, A_MULLW, A_NONE, A_NONE, A_ORI,
A_SRAWI, A_SLWI, A_SRWI, A_SUBI, A_XORI);
TOpCG2AsmOpConstHi: array[topcg] of TAsmOp = (A_NONE, A_ADDIS, A_ANDIS_,
A_DIVWU, A_DIVW, A_MULLW, A_MULLW, A_NONE, A_NONE,
A_ORIS, A_NONE, A_NONE, A_NONE, A_SUBIS, A_XORIS);
TShiftOpCG2AsmOpConst32 : array[OP_SAR..OP_SHR] of TAsmOp = (A_SRAWI, A_SLWI, A_SRWI);
TShiftOpCG2AsmOpConst64 : array[OP_SAR..OP_SHR] of TAsmOp = (A_SRADI, A_SLDI, A_SRDI);
TShiftOpCG2AsmOpConst : array[boolean, OP_SAR..OP_SHR] of TAsmOp = (
(A_SRAWI, A_SLWI, A_SRWI), (A_SRADI, A_SLDI, A_SRDI)
);
TOpCmp2AsmCond: array[topcmp] of TAsmCondFlag = (C_NONE, C_EQ, C_GT,
C_LT, C_GE, C_LE, C_NE, C_LE, C_LT, C_GE, C_GT);
@ -248,10 +241,13 @@ begin
location^.register)
else
{ load non-integral sized memory location into register. This
memory location be 1-sizeleft byte sized.
Always assume that this memory area is properly aligned, eg. start
loading the larger quantities for "odd" quantities first }
memory location be 1-sizeleft byte sized.
Always assume that this memory area is properly aligned, eg. start
loading the larger quantities for "odd" quantities first }
case sizeleft of
1,2,4,8 :
a_load_ref_reg(list, int_cgsize(sizeleft), location^.size, tmpref,
location^.register);
3 : begin
a_reg_alloc(list, NR_R12);
a_load_ref_reg(list, OS_16, location^.size, tmpref,
@ -259,7 +255,7 @@ begin
inc(tmpref.offset, tcgsize2size[OS_16]);
a_load_ref_reg(list, OS_8, location^.size, tmpref,
location^.register);
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 40));
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 40));
a_reg_dealloc(list, NR_R12);
end;
5 : begin
@ -267,8 +263,8 @@ begin
a_load_ref_reg(list, OS_32, location^.size, tmpref, NR_R12);
inc(tmpref.offset, tcgsize2size[OS_32]);
a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register);
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 24));
a_reg_dealloc(list, NR_R12);
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 24));
a_reg_dealloc(list, NR_R12);
end;
6 : begin
a_reg_alloc(list, NR_R12);
@ -286,20 +282,16 @@ begin
a_load_ref_reg(list, OS_16, location^.size, tmpref, NR_R0);
inc(tmpref.offset, tcgsize2size[OS_16]);
a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register);
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, NR_R0, NR_R12, 16, 16));
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R0, 8, 8));
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, NR_R0, NR_R12, 16, 16));
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R0, 8, 8));
a_reg_dealloc(list, NR_R0);
a_reg_dealloc(list, NR_R12);
end;
1,2,4,8 :
a_load_ref_reg(list, int_cgsize(sizeleft), location^.size, tmpref,
location^.register);
else
else
{ still > 8 bytes to load, so load data single register now }
a_load_ref_reg(list, location^.size, location^.size, tmpref,
location^.register);
end;
// a_load_ref_reg(list, location^.size, location^.size, tmpref,
// location^.register);
end;
LOC_REFERENCE:
begin
@ -368,12 +360,8 @@ begin
AT_FUNCTION)));
if (addNOP) then
list.concat(taicpu.op_none(A_NOP));
{
the compiler does not properly set this flag anymore in pass 1, and
for now we only need it after pass 2 (I hope) (JM)
if not(pi_do_call in current_procinfo.flags) then
internalerror(2003060703);
}
{ the compiler does not properly set this flag anymore in pass 1, and
for now we only need it after pass 2 (I hope) (JM) }
include(current_procinfo.flags, pi_do_call);
end;
@ -503,9 +491,9 @@ begin
32 bits should contain -1
- loading the lower 32 bits resulted in 0 in the upper 32 bits, and the upper
32 bits should contain 0 }
load32bitconstantR0(list, size, hi(a), NR_R0);
load32bitconstant(list, size, hi(a), NR_R12);
{ combine both registers }
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, NR_R0, 32, 0));
list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, NR_R12, 32, 0));
end;
end;
end;
@ -550,7 +538,7 @@ const
((A_LBZ, A_LBZU), (A_LBZX, A_LBZUX)),
((A_LHA, A_LHAU), (A_LHAX, A_LHAUX)),
{ there's no load-word-arithmetic-indexed with update, simulate it in code :( }
((A_LWA, A_LWAU), (A_LWAX, A_LWAUX)),
((A_LWA, A_NOP), (A_LWAX, A_LWAUX)),
((A_LD, A_LDU), (A_LDX, A_LDUX))
);
var
@ -563,12 +551,12 @@ begin
ref2 := ref;
fixref(list, ref2, tosize);
{ the caller is expected to have adjusted the reference already
in this case }
in this case }
if (TCGSize2Size[fromsize] >= TCGSize2Size[tosize]) then
fromsize := tosize;
op := loadinstr[fromsize, ref2.index <> NR_NO, false];
{ there is no LWAU instruction, simulate using ADDI and LWA }
if (op = A_LWAU) then begin
if (op = A_NOP) then begin
list.concat(taicpu.op_reg_reg_const(A_ADDI, reg, reg, ref2.offset));
ref2.offset := 0;
op := A_LWA;
@ -605,8 +593,8 @@ var
begin
op := movemap[fromsize, tosize];
case op of
A_MR, A_EXTSB, A_EXTSH, A_EXTSW : instr := taicpu.op_reg_reg(op, reg2, reg1);
A_RLDICL : instr := taicpu.op_reg_reg_const_const(A_RLDICL, reg2, reg1, 0, (8-tcgsize2size[fromsize])*8);
A_MR, A_EXTSB, A_EXTSH, A_EXTSW : instr := taicpu.op_reg_reg(op, reg2, reg1);
A_RLDICL : instr := taicpu.op_reg_reg_const_const(A_RLDICL, reg2, reg1, 0, (8-tcgsize2size[fromsize])*8);
else
internalerror(2002090901);
end;
@ -614,8 +602,8 @@ begin
rg[R_INTREGISTER].add_move_instruction(instr);
end;
procedure tcgppc.a_loadfpu_reg_reg(list: taasmoutput; size: tcgsize; reg1, reg2:
tregister);
procedure tcgppc.a_loadfpu_reg_reg(list: taasmoutput; size: tcgsize;
reg1, reg2: tregister);
var
instr: taicpu;
begin
@ -624,8 +612,8 @@ begin
rg[R_FPUREGISTER].add_move_instruction(instr);
end;
procedure tcgppc.a_loadfpu_ref_reg(list: taasmoutput; size: tcgsize; const ref:
treference; reg: tregister);
procedure tcgppc.a_loadfpu_ref_reg(list: taasmoutput; size: tcgsize;
const ref: treference; reg: tregister);
const
FpuLoadInstr: array[OS_F32..OS_F64, boolean, boolean] of TAsmOp =
{ indexed? updating?}
@ -654,7 +642,6 @@ end;
procedure tcgppc.a_loadfpu_reg_ref(list: taasmoutput; size: tcgsize; reg:
tregister; const ref: treference);
const
FpuStoreInstr: array[OS_F32..OS_F64, boolean, boolean] of TAsmOp =
{ indexed? updating? }
@ -688,139 +675,131 @@ end;
procedure tcgppc.a_op_const_reg_reg(list: taasmoutput; op: TOpCg;
size: tcgsize; a: aint; src, dst: tregister);
var
l1, l2: longint;
oplo, ophi: tasmop;
scratchreg: tregister;
useReg : boolean;
shiftmask : longint;
procedure do_lo_hi;
procedure do_lo_hi(loOp, hiOp : TAsmOp);
begin
{ Optimization for logical ops (excluding AND), trying to do this as efficiently
as possible by only generating code for the affected halfwords. Note that all
the instructions handled here must have "X op 0 = X" for every halfword. }
usereg := false;
if (size in [OS_64, OS_S64]) then begin
{ ts: use register method for 64 bit consts. Sloooooow }
if (aword(a) > high(dword)) then begin
usereg := true;
end else if (size in [OS_32, OS_S32]) then begin
list.concat(taicpu.op_reg_reg_const(oplo, dst, src, word(a)));
list.concat(taicpu.op_reg_reg_const(ophi, dst, dst, word(a shr 16)));
end else begin
list.concat(taicpu.op_reg_reg_const(oplo, dst, src, word(a)));
if (word(a) <> 0) then begin
list.concat(taicpu.op_reg_reg_const(loOp, dst, src, word(a)));
if (word(a shr 16) <> 0) then
list.concat(taicpu.op_reg_reg_const(hiOp, dst, dst, word(a shr 16)));
end else if (word(a shr 16) <> 0) then
list.concat(taicpu.op_reg_reg_const(hiOp, dst, src, word(a shr 16)));
end;
end;
procedure do_lo_hi_and;
begin
{ optimization logical and with immediate: only use "andi." for 16 bit
ands, otherwise use register method. Doing this for 32 bit constants
would not give any advantage to the register method (via useReg := true),
requiring a scratch register and three instructions. }
usereg := false;
if (aword(a) > high(word)) then
usereg := true
else
list.concat(taicpu.op_reg_reg_const(A_ANDI_, dst, src, word(a)));
end;
var
scratchreg: tregister;
shift, shiftmask : longint;
begin
{ subtraction is the same as addition with negative constant }
if op = OP_SUB then begin
a_op_const_reg_reg(list, OP_ADD, size, -a, src, dst);
exit;
end;
ophi := TOpCG2AsmOpConstHi[op];
oplo := TOpCG2AsmOpConstLo[op];
{ peephole optimizations for AND, OR, XOR - can't this be done at
some higher level, independent of architecture? }
if (op in [OP_AND, OP_OR, OP_XOR]) then begin
if (a = 0) then begin
if op = OP_AND then
list.concat(taicpu.op_reg_const(A_LI, dst, 0))
else
a_load_reg_reg(list, size, size, src, dst);
exit;
end else if (a = -1) then begin
case op of
OP_OR:
list.concat(taicpu.op_reg_const(A_LI, dst, -1));
OP_XOR:
list.concat(taicpu.op_reg_reg(A_NOT, dst, src));
OP_AND:
a_load_reg_reg(list, size, size, src, dst);
end;
exit;
end;
{ optimization for add }
end else if (op = OP_ADD) then
if a = 0 then begin
a_load_reg_reg(list, size, size, src, dst);
exit;
end else if (a >= low(smallint)) and (a <= high(smallint)) then begin
list.concat(taicpu.op_reg_reg_const(A_ADDI, dst, src, smallint(a)));
exit;
end;
{ This case includes some peephole optimizations for the various operations,
(e.g. AND, OR, XOR, ..) - can't this be done at some higher level,
independent of architecture? }
{ otherwise, the instructions we can generate depend on the operation }
{ assume that we do not need a scratch register for the operation }
useReg := false;
case op of
case (op) of
OP_DIV, OP_IDIV:
{ actually, this method should be never called directly with OP_DIV or
OP_IDIV, so just provide basic support.
TODO: move division by constant stuff from nppcmat.pas here }
if (a = 0) then
internalerror(200208103)
else if (a = 1) then begin
a_load_reg_reg(list, OS_INT, OS_INT, src, dst);
exit
end else if false {and ispowerof2(a, l1)} then begin
internalerror(200208103);
case op of
OP_DIV: begin
list.concat(taicpu.op_reg_reg_const(A_SRDI, dst, src, l1));
end;
OP_IDIV:
begin
list.concat(taicpu.op_reg_reg_const(A_SRADI, dst, src, l1));
list.concat(taicpu.op_reg_reg(A_ADDZE, dst, dst));
end;
end;
exit;
end else
usereg := true;
else if (a = 1) then
a_load_reg_reg(list, size, size, src, dst)
else
usereg := true;
OP_IMUL, OP_MUL:
if (a = 0) then begin
list.concat(taicpu.op_reg_const(A_LI, dst, 0));
exit
end else if (a = -1) then begin
list.concat(taicpu.op_reg_reg(A_NEG, dst, dst));
end else if (a = 1) then begin
a_load_reg_reg(list, OS_INT, OS_INT, src, dst);
exit
end else if ispowerof2(a, l1) then
list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, l1))
{ idea: factorize constant multiplicands and use adds/shifts with few factors;
however, even a 64 bit multiply is already quite fast on PPC64 }
if (a = 0) then
a_load_const_reg(list, size, 0, dst)
else if (a = -1) then
list.concat(taicpu.op_reg_reg(A_NEG, dst, dst))
else if (a = 1) then
a_load_reg_reg(list, OS_INT, OS_INT, src, dst)
else if ispowerof2(a, shift) then
list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, shift))
else if (a >= low(smallint)) and (a <= high(smallint)) then
list.concat(taicpu.op_reg_reg_const(A_MULLI, dst, src,
smallint(a)))
else
usereg := true;
OP_ADD:
{$todo ts:optimize}
useReg := true;
if (a = 0) then
a_load_reg_reg(list, size, size, src, dst)
else if (a >= low(smallint)) and (a <= high(smallint)) then
list.concat(taicpu.op_reg_reg_const(A_ADDI, dst, src, smallint(a)))
else
useReg := true;
OP_OR:
do_lo_hi;
if (a = 0) then
a_load_reg_reg(list, size, size, src, dst)
else if (a = -1) then
a_load_const_reg(list, size, -1, dst)
else
do_lo_hi(A_ORI, A_ORIS);
OP_AND:
useReg := true;
if (a = 0) then
a_load_const_reg(list, size, 0, dst)
else if (a = -1) then
a_load_reg_reg(list, size, size, src, dst)
else
do_lo_hi_and;
OP_XOR:
do_lo_hi;
if (a = 0) then
a_load_reg_reg(list, size, size, src, dst)
else if (a = -1) then
list.concat(taicpu.op_reg_reg(A_NOT, dst, src))
else
do_lo_hi(A_XORI, A_XORIS);
OP_SHL, OP_SHR, OP_SAR:
begin
{$note ts: cleanup todo, fix remaining bugs}
if (size in [OS_64, OS_S64]) then begin
if (a and 63) <> 0 then
list.concat(taicpu.op_reg_reg_const(
TShiftOpCG2AsmOpConst64[Op], dst, src, a and 63))
else
a_load_reg_reg(list, size, size, src, dst);
if (a shr 6) <> 0 then
internalError(68991);
end else begin
if (a and 31) <> 0 then
list.concat(taicpu.op_reg_reg_const(
TShiftOpCG2AsmOpConst32[Op], dst, src, a and 31))
else
a_load_reg_reg(list, size, size, src, dst);
if (a shr 5) <> 0 then
internalError(68991);
end;
if (size in [OS_64, OS_S64]) then
shift := 6
else
shift := 5;
shiftmask := (1 shl shift)-1;
if (a and shiftmask) <> 0 then
list.concat(taicpu.op_reg_reg_const(
TShiftOpCG2AsmOpConst[size in [OS_64, OS_S64], op], dst, src, a and shiftmask))
else
a_load_reg_reg(list, size, size, src, dst);
if ((a shr shift) <> 0) then
internalError(68991);
end
else
internalerror(200109091);
else
internalerror(200109091);
end;
{ if all else failed, load the constant in a register and then }
{ perform the operation }
if useReg then begin
{ if all else failed, load the constant in a register and then
perform the operation }
if (useReg) then begin
scratchreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
a_load_const_reg(list, size, a, scratchreg);
a_op_reg_reg_reg(list, op, size, scratchreg, src, dst);
@ -843,35 +822,29 @@ begin
OP_NEG, OP_NOT:
begin
list.concat(taicpu.op_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src1));
if (op = OP_NOT) and
not (size in [OS_64, OS_S64]) then
if (op = OP_NOT) and not (size in [OS_64, OS_S64]) then
{ zero/sign extend result again, fromsize is not important here }
a_load_reg_reg(list, OS_S64, size, dst, dst)
end;
else
{$NOTE ts:testme}
if (size in [OS_64, OS_S64]) then begin
list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src2,
src1));
end else begin
list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop32[op], dst, src2,
src1));
end;
else
if (size in [OS_64, OS_S64]) then begin
list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src2,
src1));
end else begin
list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop32[op], dst, src2,
src1));
end;
end;
end;
{*************** compare instructructions ****************}
procedure tcgppc.a_cmp_const_reg_label(list: taasmoutput; size: tcgsize; cmp_op:
topcmp; a: aint; reg: tregister;
l: tasmlabel);
procedure tcgppc.a_cmp_const_reg_label(list: taasmoutput; size: tcgsize;
cmp_op: topcmp; a: aint; reg: tregister; l: tasmlabel);
var
scratch_register: TRegister;
signed: boolean;
begin
{ todo: use 32 bit compares? }
signed := cmp_op in [OC_GT, OC_LT, OC_GTE, OC_LTE];
{ in the following case, we generate more efficient code when }
{ signed is true }
@ -897,13 +870,10 @@ begin
a_jmp(list, A_BC, TOpCmp2AsmCond[cmp_op], 0, l);
end;
procedure tcgppc.a_cmp_reg_reg_label(list: taasmoutput; size: tcgsize; cmp_op:
topcmp;
reg1, reg2: tregister; l: tasmlabel);
procedure tcgppc.a_cmp_reg_reg_label(list: taasmoutput; size: tcgsize;
cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);
var
op: tasmop;
begin
if cmp_op in [OC_GT, OC_LT, OC_GTE, OC_LTE] then
if (size in [OS_64, OS_S64]) then
@ -953,11 +923,9 @@ end;
procedure tcgppc.g_flags2reg(list: taasmoutput; size: TCgSize; const f:
TResFlags; reg: TRegister);
var
testbit: byte;
bitvalue: boolean;
begin
{ get the bit to extract from the conditional register + its requested value (0 or 1) }
testbit := ((f.cr - RS_CR0) * 4);
@ -1375,7 +1343,7 @@ begin
list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, 8));
countreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
a_load_const_reg(list, OS_32, count, countreg);
{ explicitely allocate R_0 since it can be used safely here
{ explicitely allocate F0 since it can be used safely here
(for holding date that's being copied) }
a_reg_alloc(list, NR_F0);
objectlibrary.getjumplabel(lab);

View File

@ -94,7 +94,7 @@ type
A_SRDI, A_SRADI,
A_SLDI,
A_RLDICL,
A_DIVDU, A_DIVD, A_MULLD, A_SRAD, A_SLD, A_SRD,
A_DIVDU, A_DIVDU_, A_DIVD, A_DIVD_, A_MULLD, A_MULLD_, A_MULHD, A_MULHD_, A_SRAD, A_SLD, A_SRD,
A_DIVDUO_, A_DIVDO_,
A_LWA, A_LWAU, A_LWAX, A_LWAUX,
A_FCFID,

View File

@ -49,9 +49,7 @@ const
{ the difference to stdcall is only the name mangling }
pocall_cdecl,
{ the difference to stdcall is only the name mangling }
pocall_cppdecl,
{ pass all const records by reference }
pocall_mwpascal
pocall_cppdecl
];
processorsstr: array[tprocessors] of string[10] = ('',

View File

@ -408,10 +408,11 @@ begin
end;
end;
end;
curintreg := nextintreg;
curfloatreg := nextfloatreg;
curmmreg := nextmmreg;
cur_stack_offset := stack_offset;
cur_stack_offset := stack_offset;
result := stack_offset;
end;

View File

@ -66,7 +66,7 @@ var
begin
if not (po_assembler in procdef.procoptions) then begin
{ align the stack properly }
ofs := align(maxpushedparasize + LinkageAreaSizeELF, ELF_STACK_ALIGN);
ofs := align(maxpushedparasize + LinkageAreaSizeELF, 8);
{ the ABI specification says that it is required to always allocate space for 8 * 8 bytes
for registers R3-R10 and stack header if there's a stack frame, but GCC doesn't do that,
@ -74,7 +74,6 @@ begin
// if (ofs < 112) then begin
// ofs := 112;
// end;
tg.setfirsttemp(ofs);
end else begin
locals := 0;

View File

@ -84,7 +84,7 @@ const
'srdi', 'sradi',
'sldi',
'rldicl',
'divdu', 'divd', 'mulld', 'srad', 'sld', 'srd',
'divdu', 'divdu.', 'divd', 'divd.', 'mulld', 'mulld.', 'mulhd', 'mulhd.', 'srad', 'sld', 'srd',
'divduo.', 'divdo.',
'lwa', '<illegal lwau>', 'lwax', 'lwaux',
'fcfid',

View File

@ -163,7 +163,6 @@ begin
end
end;
// Todo: ts: allow emiting word compares...
procedure tppcaddnode.emit_compare(unsigned: boolean);
var
op: tasmop;
@ -175,8 +174,7 @@ begin
swapleftright;
// can we use an immediate, or do we have to load the
// constant in a register first?
if (right.location.loc = LOC_CONSTANT) then
begin
if (right.location.loc = LOC_CONSTANT) then begin
if (nodetype in [equaln, unequaln]) then
if (unsigned and
(aword(right.location.value) > high(word))) or
@ -193,15 +191,13 @@ begin
(aint(right.location.value) >= low(smallint)) and
(aint(right.location.value) <= high(smallint))) then
useconst := true
else
begin
else begin
useconst := false;
tmpreg := cg.getintregister(exprasmlist, OS_INT);
cg.a_load_const_reg(exprasmlist, OS_INT,
right.location.value, tmpreg);
end
end
else
end else
useconst := false;
location.loc := LOC_FLAGS;
location.resflags := getresflags;
@ -215,15 +211,13 @@ begin
else
op := A_CMPLD;
if (right.location.loc = LOC_CONSTANT) then
begin
if (right.location.loc = LOC_CONSTANT) then begin
if useconst then
exprasmlist.concat(taicpu.op_reg_const(op, left.location.register,
longint(right.location.value)))
else
exprasmlist.concat(taicpu.op_reg_reg(op, left.location.register, tmpreg));
end
else
end else
exprasmlist.concat(taicpu.op_reg_reg(op,
left.location.register, right.location.register));
end;
@ -237,7 +231,7 @@ var
cgop: TOpCg;
cgsize: TCgSize;
cmpop,
isjump: boolean;
isjump: boolean;
otl, ofl: tasmlabel;
begin
{ calculate the operator which is more difficult }
@ -525,7 +519,6 @@ begin
cg.a_op_reg_reg(exprasmlist, OP_SHL, OS_64,
right.location.register, tmpreg);
if left.location.loc <> LOC_CONSTANT then begin
cg.a_op_reg_reg_reg(exprasmlist, OP_OR, OS_64, tmpreg,
left.location.register, location.register)
end else begin

View File

@ -36,6 +36,8 @@ type
}
function first_abs_real: tnode; override;
function first_sqr_real: tnode; override;
{ todo: inline trunc/round/frac?/int }
procedure second_abs_real; override;
procedure second_sqr_real; override;
procedure second_prefetch; override;

View File

@ -59,6 +59,176 @@ uses
cpubase, cpuinfo,
ncgutil, cgcpu, rgobj;
{ helper functions }
procedure getmagic_unsigned32(d : dword; out magic_m : dword; out magic_add : boolean; out magic_shift : dword);
var
p : longint;
nc, delta, q1, r1, q2, r2 : dword;
begin
assert(d > 0);
magic_add := false;
nc := - 1 - (-d) mod d;
p := 31; { initialize p }
q1 := $80000000 div nc; { initialize q1 = 2p/nc }
r1 := $80000000 - q1*nc; { initialize r1 = rem(2p,nc) }
q2 := $7FFFFFFF div d; { initialize q2 = (2p-1)/d }
r2 := $7FFFFFFF - q2*d; { initialize r2 = rem((2p-1),d) }
repeat
inc(p);
if (r1 >= (nc - r1)) then begin
q1 := 2 * q1 + 1; { update q1 }
r1 := 2*r1 - nc; { update r1 }
end else begin
q1 := 2*q1; { update q1 }
r1 := 2*r1; { update r1 }
end;
if ((r2 + 1) >= (d - r2)) then begin
if (q2 >= $7FFFFFFF) then
magic_add := true;
q2 := 2*q2 + 1; { update q2 }
r2 := 2*r2 + 1 - d; { update r2 }
end else begin
if (q2 >= $80000000) then
magic_add := true;
q2 := 2*q2; { update q2 }
r2 := 2*r2 + 1; { update r2 }
end;
delta := d - 1 - r2;
until not ((p < 64) and ((q1 < delta) or ((q1 = delta) and (r1 = 0))));
magic_m := q2 + 1; { resulting magic number }
magic_shift := p - 32; { resulting shift }
end;
procedure getmagic_signed32(d : longint; out magic_m : longint; out magic_s : longint);
const
two_31 : DWord = high(longint)+1;
var
p : Longint;
ad, anc, delta, q1, r1, q2, r2, t : DWord;
begin
assert((d < -1) or (d > 1));
ad := abs(d);
t := two_31 + (DWord(d) shr 31);
anc := t - 1 - t mod ad; { absolute value of nc }
p := 31; { initialize p }
q1 := two_31 div anc; { initialize q1 = 2p/abs(nc) }
r1 := two_31 - q1*anc; { initialize r1 = rem(2p,abs(nc)) }
q2 := two_31 div ad; { initialize q2 = 2p/abs(d) }
r2 := two_31 - q2*ad; { initialize r2 = rem(2p,abs(d)) }
repeat
inc(p);
q1 := 2*q1; { update q1 = 2p/abs(nc) }
r1 := 2*r1; { update r1 = rem(2p/abs(nc)) }
if (r1 >= anc) then begin { must be unsigned comparison }
inc(q1);
dec(r1, anc);
end;
q2 := 2*q2; { update q2 = 2p/abs(d) }
r2 := 2*r2; { update r2 = rem(2p/abs(d)) }
if (r2 >= ad) then begin { must be unsigned comparison }
inc(q2);
dec(r2, ad);
end;
delta := ad - r2;
until not ((q1 < delta) or ((q1 = delta) and (r1 = 0)));
magic_m := q2 + 1;
if (d < 0) then begin
magic_m := -magic_m; { resulting magic number }
end;
magic_s := p - 32; { resulting shift }
end;
{ helper functions }
procedure getmagic_unsigned64(d : qword; out magic_m : qword; out magic_add : boolean; out magic_shift : qword);
const
two_63 : QWord = $8000000000000000;
var
p : int64;
nc, delta, q1, r1, q2, r2 : qword;
begin
assert(d > 0);
magic_add := false;
nc := - 1 - (-d) mod d;
p := 63; { initialize p }
q1 := two_63 div nc; { initialize q1 = 2p/nc }
r1 := two_63 - q1*nc; { initialize r1 = rem(2p,nc) }
q2 := (two_63-1) div d; { initialize q2 = (2p-1)/d }
r2 := (two_63-1) - q2*d; { initialize r2 = rem((2p-1),d) }
repeat
inc(p);
if (r1 >= (nc - r1)) then begin
q1 := 2 * q1 + 1; { update q1 }
r1 := 2*r1 - nc; { update r1 }
end else begin
q1 := 2*q1; { update q1 }
r1 := 2*r1; { update r1 }
end;
if ((r2 + 1) >= (d - r2)) then begin
if (q2 >= (two_63-1)) then
magic_add := true;
q2 := 2*q2 + 1; { update q2 }
r2 := 2*r2 + 1 - d; { update r2 }
end else begin
if (q2 >= two_63) then
magic_add := true;
q2 := 2*q2; { update q2 }
r2 := 2*r2 + 1; { update r2 }
end;
delta := d - 1 - r2;
until not ((p < 128) and ((q1 < delta) or ((q1 = delta) and (r1 = 0))));
magic_m := q2 + 1; { resulting magic number }
magic_shift := p - 64; { resulting shift }
end;
procedure getmagic_signed64(d : int64; out magic_m : int64; out magic_s : int64);
const
two_63 : QWord = $8000000000000000;
var
p : int64;
ad, anc, delta, q1, r1, q2, r2, t : QWord;
begin
assert((d < -1) or (d > 1));
ad := abs(d);
t := two_63 + (QWord(d) shr 63);
anc := t - 1 - t mod ad; { absolute value of nc }
p := 63; { initialize p }
q1 := two_63 div anc; { initialize q1 = 2p/abs(nc) }
r1 := two_63 - q1*anc; { initialize r1 = rem(2p,abs(nc)) }
q2 := two_63 div ad; { initialize q2 = 2p/abs(d) }
r2 := two_63 - q2*ad; { initialize r2 = rem(2p,abs(d)) }
repeat
inc(p);
q1 := 2*q1; { update q1 = 2p/abs(nc) }
r1 := 2*r1; { update r1 = rem(2p/abs(nc)) }
if (r1 >= anc) then begin { must be unsigned comparison }
inc(q1);
dec(r1, anc);
end;
q2 := 2*q2; { update q2 = 2p/abs(d) }
r2 := 2*r2; { update r2 = rem(2p/abs(d)) }
if (r2 >= ad) then begin { must be unsigned comparison }
inc(q2);
dec(r2, ad);
end;
delta := ad - r2;
until not ((q1 < delta) or ((q1 = delta) and (r1 = 0)));
magic_m := q2 + 1;
if (d < 0) then begin
magic_m := -magic_m; { resulting magic number }
end;
magic_s := p - 64; { resulting shift }
end;
{*****************************************************************************
TPPCMODDIVNODE
*****************************************************************************}
@ -70,6 +240,200 @@ begin
include(current_procinfo.flags, pi_do_call);
end;
procedure tppcmoddivnode.pass_2;
const { signed overflow }
divops: array[boolean, boolean] of tasmop =
((A_DIVDU,A_DIVDU_),(A_DIVD,A_DIVDO_));
zerocond: tasmcond = (dirhint: DH_Plus; simple: true; cond:C_NE; cr: RS_CR7);
var
power : longint;
op : tasmop;
numerator, divider,
resultreg : tregister;
size : TCgSize;
hl : tasmlabel;
done: boolean;
procedure genOrdConstNodeDiv;
const
negops : array[boolean] of tasmop = (A_NEG, A_NEGO);
var
magic, shift : int64;
u_magic, u_shift : qword;
u_add : boolean;
divreg : tregister;
begin
if (tordconstnode(right).value = 0) then begin
internalerror(2005061701);
end else if (tordconstnode(right).value = 1) then begin
cg.a_load_reg_reg(exprasmlist, OS_INT, OS_INT, numerator, resultreg);
end else if (tordconstnode(right).value = -1) then begin
{ note: only in the signed case possible..., may overflow }
exprasmlist.concat(taicpu.op_reg_reg(negops[cs_check_overflow in aktlocalswitches], resultreg, numerator));
end else if (ispowerof2(tordconstnode(right).value, power)) then begin
if (is_signed(right.resulttype.def)) then begin
{ From "The PowerPC Compiler Writer's Guide", pg. 52ff }
cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, power,
numerator, resultreg);
exprasmlist.concat(taicpu.op_reg_reg(A_ADDZE, resultreg, resultreg));
end else begin
cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, power, numerator, resultreg)
end;
end else begin
{ replace division by multiplication, both implementations }
{ from "The PowerPC Compiler Writer's Guide" pg. 53ff }
divreg := cg.getintregister(exprasmlist, OS_INT);
if (is_signed(right.resulttype.def)) then begin
getmagic_signed64(tordconstnode(right).value, magic, shift);
{ load magic value }
cg.a_load_const_reg(exprasmlist, OS_INT, magic, divreg);
{ multiply }
exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULHD, resultreg, numerator, divreg));
{ add/subtract numerator }
if (tordconstnode(right).value > 0) and (magic < 0) then begin
cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, numerator, resultreg, resultreg);
end else if (tordconstnode(right).value < 0) and (magic > 0) then begin
cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, numerator, resultreg, resultreg);
end;
{ shift shift places to the right (arithmetic) }
cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, shift, resultreg, resultreg);
{ extract and add sign bit }
if (tordconstnode(right).value >= 0) then begin
cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 63, numerator, divreg);
end else begin
cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 63, resultreg, divreg);
end;
cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, resultreg, divreg, resultreg);
end else begin
getmagic_unsigned64(tordconstnode(right).value, u_magic, u_add, u_shift);
{ load magic in divreg }
cg.a_load_const_reg(exprasmlist, OS_INT, u_magic, divreg);
exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULHDU, resultreg, numerator, divreg));
if (u_add) then begin
cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, divreg);
cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 1, divreg, divreg);
cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, divreg, resultreg, divreg);
cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, u_shift-1, divreg, resultreg);
end else begin
cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, u_shift, resultreg, resultreg);
end;
end;
end;
done := true;
end;
procedure genOrdConstNodeMod;
var
modreg, maskreg, tempreg : tregister;
begin
if (tordconstnode(right).value = 0) then begin
internalerror(2005061702);
end else if (abs(tordconstnode(right).value) = 1) then begin
{ x mod +/-1 is always zero }
cg.a_load_const_reg(exprasmlist, OS_INT, 0, resultreg);
end else if (ispowerof2(tordconstnode(right).value, power)) then begin
if (is_signed(right.resulttype.def)) then begin
tempreg := cg.getintregister(exprasmlist, OS_INT);
maskreg := cg.getintregister(exprasmlist, OS_INT);
modreg := cg.getintregister(exprasmlist, OS_INT);
cg.a_load_const_reg(exprasmlist, OS_INT, abs(tordconstnode(right).value)-1, modreg);
cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, 63, numerator, maskreg);
cg.a_op_reg_reg_reg(exprasmlist, OP_AND, OS_INT, numerator, modreg, tempreg);
exprasmlist.concat(taicpu.op_reg_reg_reg(A_ANDC, maskreg, maskreg, modreg));
exprasmlist.concat(taicpu.op_reg_reg_const(A_SUBFIC, modreg, tempreg, 0));
exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUBFE, modreg, modreg, modreg));
cg.a_op_reg_reg_reg(exprasmlist, OP_AND, OS_INT, modreg, maskreg, maskreg);
cg.a_op_reg_reg_reg(exprasmlist, OP_OR, OS_INT, maskreg, tempreg, resultreg);
end else begin
cg.a_op_const_reg_reg(exprasmlist, OP_AND, OS_INT, tordconstnode(right).value-1, numerator, resultreg);
end;
end else begin
genOrdConstNodeDiv();
cg.a_op_const_reg_reg(exprasmlist, OP_MUL, OS_INT, tordconstnode(right).value, resultreg, resultreg);
cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, resultreg);
end;
end;
begin
secondpass(left);
secondpass(right);
location_copy(location,left.location);
{ put numerator in register }
size:=def_cgsize(left.resulttype.def);
location_force_reg(exprasmlist,left.location,
size,true);
location_copy(location,left.location);
numerator := location.register;
resultreg := location.register;
if (location.loc = LOC_CREGISTER) then begin
location.loc := LOC_REGISTER;
location.register := cg.getintregister(exprasmlist,size);
resultreg := location.register;
end else if (nodetype = modn) or (right.nodetype = ordconstn) then begin
{ for a modulus op, and for const nodes we need the result register
to be an extra register }
resultreg := cg.getintregister(exprasmlist,size);
end;
done := false;
(*
if (right.nodetype = ordconstn) then begin
if (nodetype = divn) then
genOrdConstNodeDiv
else
genOrdConstNodeMod;
done := true;
end;
*)
if (not done) then begin
{ load divider in a register if necessary }
location_force_reg(exprasmlist,right.location,
def_cgsize(right.resulttype.def),true);
if (right.nodetype <> ordconstn) then
exprasmlist.concat(taicpu.op_reg_reg_const(A_CMPDI, NR_CR7,
right.location.register, 0))
else begin
if (tordconstnode(right).value = 0) then
internalerror(2005100301);
end;
divider := right.location.register;
{ needs overflow checking, (-maxlongint-1) div (-1) overflows! }
op := divops[is_signed(right.resulttype.def),
cs_check_overflow in aktlocalswitches];
exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator,
divider));
if (nodetype = modn) then begin
exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD,resultreg,
divider,resultreg));
exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUB,location.register,
numerator,resultreg));
resultreg := location.register;
end;
end;
{ set result location }
location.loc:=LOC_REGISTER;
location.register:=resultreg;
if right.nodetype <> ordconstn then begin
objectlibrary.getjumplabel(hl);
exprasmlist.concat(taicpu.op_cond_sym(A_BC,zerocond,hl));
cg.a_call_name(exprasmlist,'FPC_DIVBYZERO');
cg.a_label(exprasmlist,hl);
end;
{ unsigned division/module can only overflow in case of division by zero
(but checking this overflow flag is more convoluted than performing a
simple comparison with 0) }
if is_signed(right.resulttype.def) then
cg.g_overflowcheck(exprasmlist,location,resulttype.def);
end;
(*
procedure tppcmoddivnode.pass_2;
const
// ts: todo, use 32 bit operations if possible (much faster!)
@ -130,9 +494,7 @@ begin
end else begin
cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, power, numerator, resultreg);
end;
end
else
begin
end else begin
{ load divider in a register if necessary }
location_force_reg(exprasmlist, right.location,
def_cgsize(right.resulttype.def), true);
@ -150,8 +512,7 @@ begin
exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator,
divider));
if (nodetype = modn) then
begin
if (nodetype = modn) then begin
{$NOTE ts:testme}
exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD, resultreg,
divider, resultreg));
@ -163,8 +524,7 @@ begin
{ set result location }
location.loc := LOC_REGISTER;
location.register := resultreg;
if right.nodetype <> ordconstn then
begin
if (right.nodetype <> ordconstn) then begin
objectlibrary.getjumplabel(hl);
exprasmlist.concat(taicpu.op_cond_sym(A_BC, zerocond, hl));
cg.a_call_name(exprasmlist, 'FPC_DIVBYZERO');
@ -172,7 +532,7 @@ begin
end;
cg.g_overflowcheck(exprasmlist, location, resulttype.def);
end;
*)
{*****************************************************************************
TPPCSHLRSHRNODE
*****************************************************************************}
@ -181,8 +541,8 @@ end;
procedure tppcshlshrnode.pass_2;
var
resultreg, hregister1, hregister2,
hreg64hi, hreg64lo: tregister;
resultreg, hregister1, hregister2 : tregister;
op: topcg;
asmop1, asmop2: tasmop;
shiftval: aint;
@ -199,7 +559,7 @@ begin
hregister1 := location.register;
if (location.loc = LOC_CREGISTER) then begin
location.loc := LOC_REGISTER;
resultreg := cg.getintregister(exprasmlist, OS_64);
resultreg := cg.getintregister(exprasmlist, OS_INT);
location.register := resultreg;
end;
@ -257,17 +617,14 @@ begin
end;
LOC_REFERENCE, LOC_CREFERENCE:
begin
if (left.resulttype.def.deftype = floatdef) then
begin
if (left.resulttype.def.deftype = floatdef) then begin
src1 := cg.getfpuregister(exprasmlist,
def_cgsize(left.resulttype.def));
location.register := src1;
cg.a_loadfpu_ref_reg(exprasmlist,
def_cgsize(left.resulttype.def),
left.location.reference, src1);
end
else
begin
end else begin
src1 := cg.getintregister(exprasmlist, OS_64);
location.register := src1;
cg.a_load_ref_reg(exprasmlist, OS_64, OS_64,
@ -276,28 +633,19 @@ begin
end;
end;
{ choose appropriate operand }
if left.resulttype.def.deftype <> floatdef then
begin
if left.resulttype.def.deftype <> floatdef then begin
if not (cs_check_overflow in aktlocalswitches) then
op := A_NEG
else
op := A_NEGO_;
location.loc := LOC_REGISTER;
end
else
begin
end else begin
op := A_FNEG;
location.loc := LOC_FPUREGISTER;
end;
{ emit operation }
exprasmlist.concat(taicpu.op_reg_reg(op, location.register, src1));
end;
{ Here was a problem... }
{ Operand to be negated always }
{ seems to be converted to signed }
{ 32-bit before doing neg!! }
{ So this is useless... }
{ that's not true: -2^31 gives an overflow error if it is negated (FK) }
cg.g_overflowcheck(exprasmlist, location, resulttype.def);
end;