fpc/compiler/i386/n386add.pas
florian 99b01f6634 * second mul might get called with a zero operand if the other has a side effect,
handle this correctly in ti386addnode.second_mul64bit, resolves #40182
2023-03-06 23:19:34 +01:00

673 lines
26 KiB
ObjectPascal

{
Copyright (c) 2000-2002 by Florian Klaempfl
Code generation for add nodes on the i386
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
****************************************************************************
}
unit n386add;
{$i fpcdefs.inc}
interface
uses
node,nadd,cpubase,nx86add;
type
  { i386-specific add node. Overrides the generic/x86 add node to provide
    64-bit addition, subtraction, logical ops and comparisons on a 32-bit
    target (via register pairs), and EDX:EAX-based multiplication. }
  ti386addnode = class(tx86addnode)
    { returns false: i386 has a dedicated 32x32->64 bit multiplication path }
    function use_generic_mul32to64: boolean; override;
    { returns true when the RTL helper should be used for 64-bit mul
      (overflow checking requested, or optimizing for size) }
    function use_generic_mul64bit: boolean; override;
    { dispatches multiplications to second_mul where profitable }
    procedure second_addordinal; override;
    { 64-bit add/sub/and/or/xor/mul on a 32-bit target }
    procedure second_add64bit;override;
    { 64-bit comparisons via a two-stage (high dword, then low dword) jump sequence }
    procedure second_cmp64bit;override;
    { unsigned 32-bit and 32->64 bit multiplications using MUL/IMUL/MULX }
    procedure second_mul(unsigned: boolean);
    { inlined lower-64-bits-of-128 multiplication of two 64-bit operands }
    procedure second_mul64bit;
  protected
    { moves the EDX:EAX (or EAX) multiplication result into virtual registers }
    procedure set_mul_result_location;
  end;
implementation
uses
globtype,systems,
cutils,verbose,globals,
symconst,symdef,paramgr,defutil,
aasmbase,aasmtai,aasmdata,aasmcpu,
cgbase,procinfo,
ncon,nset,cgutils,tgobj,
cpuinfo,
cga,ncgutil,cgobj,cg64f32,cgx86,
hlcgobj;
{*****************************************************************************
use_generic_mul32to64
*****************************************************************************}
function ti386addnode.use_generic_mul32to64: boolean;
  begin
    { never use the generic code: i386 can do 32x32->64 bit directly
      via MUL/IMUL (see second_mul) }
    use_generic_mul32to64:=false;
  end;
function ti386addnode.use_generic_mul64bit: boolean;
  begin
    { fall back to the generic 64-bit multiplication (RTL helper) when
      optimizing for size, or when an overflow check is required (the
      inlined sequence in second_mul64bit cannot detect overflow) }
    result:=(cs_opt_size in current_settings.optimizerswitches) or
            needoverflowcheck;
  end;
{ handles all unsigned multiplications, and 32->64 bit signed ones.
  32bit-only signed mul is handled by generic codegen }
procedure ti386addnode.second_addordinal;
  var
    unsigned: boolean;
  begin
    { the operation is unsigned unless both operands are signed }
    unsigned:=not(is_signed(left.resultdef)) or
              not(is_signed(right.resultdef));
    { use IMUL instead of MUL in case overflow checking is off and we're
      doing a 32->32-bit multiplication; the low 32 bits of the result are
      the same for MUL and IMUL, and IMUL is more flexible }
    if not needoverflowcheck and
       not is_64bit(resultdef) then
      unsigned:=false;
    { second_mul handles unsigned and all 32->64 bit multiplications;
      everything else goes through the generic code generator }
    if (nodetype=muln) and (unsigned or is_64bit(resultdef)) then
      second_mul(unsigned)
    else
      inherited second_addordinal;
  end;
{*****************************************************************************
Add64bit
*****************************************************************************}
{ Generates code for 64-bit add/sub/and/or/xor on the 32-bit target.
  Multiplication is delegated to second_mul64bit. Operands are handled
  as register pairs (reglo/reghi); sub uses an explicit SUB/SBB sequence
  when the operands were swapped, since sub is not commutative. }
procedure ti386addnode.second_add64bit;
  var
    op : TOpCG;
    op1,op2 : TAsmOp;    { low/high dword opcodes for the swapped-sub path }
    opsize : TOpSize;
    hregister,
    hregister2 : tregister;
    hl4 : tasmlabel;     { jump target that skips the overflow call }
    mboverflow,          { true for operations that can overflow (add/sub) }
    unsigned:boolean;
    r:Tregister;
  begin
    pass_left_right;

    op1:=A_NONE;
    op2:=A_NONE;
    mboverflow:=false;
    opsize:=S_L;
    { the operation is unsigned if either operand is u64bit; this decides
      which flags are checked for overflow (carry vs. overflow flag) }
    unsigned:=((left.resultdef.typ=orddef) and
               (torddef(left.resultdef).ordtype=u64bit)) or
              ((right.resultdef.typ=orddef) and
               (torddef(right.resultdef).ordtype=u64bit));
    case nodetype of
      addn :
        begin
          op:=OP_ADD;
          mboverflow:=true;
        end;
      subn :
        begin
          op:=OP_SUB;
          op1:=A_SUB;
          op2:=A_SBB;
          mboverflow:=true;
        end;
      xorn:
        op:=OP_XOR;
      orn:
        op:=OP_OR;
      andn:
        op:=OP_AND;
      muln:
        begin
          { multiplication has its own dedicated code path }
          second_mul64bit;
          exit;
        end
      else
        begin
          { everything should be handled in pass_1 (JM) }
          internalerror(2001090505);
        end;
    end;

    { left and right no register?  }
    { then one must be demanded    }
    if (left.location.loc<>LOC_REGISTER) then
      begin
        if (right.location.loc<>LOC_REGISTER) then
          begin
            { neither operand is in registers: force left into a fresh pair }
            hregister:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
            hregister2:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
            cg64.a_load64_loc_reg(current_asmdata.CurrAsmList,left.location,joinreg64(hregister,hregister2));
            location_reset(left.location,LOC_REGISTER,left.location.size);
            left.location.register64.reglo:=hregister;
            left.location.register64.reghi:=hregister2;
          end
        else
          begin
            { right is already in registers: swap so that left is the
              register operand, and remember the swap for subn }
            location_swap(left.location,right.location);
            toggleflag(nf_swapped);
          end;
      end;

    { at this point, left.location.loc should be LOC_REGISTER }
    if right.location.loc=LOC_REGISTER then
      begin
        { keep the flags register live for the overflow check below }
        if mboverflow and needoverflowcheck then
          cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
        { when swapped another result register }
        if (nodetype=subn) and (nf_swapped in flags) then
          begin
            { compute right-left into right's registers, then swap back
              so the result ends up in left.location as usual }
            cg64.a_op64_reg_reg(current_asmdata.CurrAsmList,op,location.size,
              left.location.register64,
              right.location.register64);
            location_swap(left.location,right.location);
            toggleflag(nf_swapped);
          end
        else
          begin
            cg64.a_op64_reg_reg(current_asmdata.CurrAsmList,op,location.size,
              right.location.register64,
              left.location.register64);
          end;
      end
    else
      begin
        { right.location<>LOC_REGISTER }
        if (nodetype=subn) and (nf_swapped in flags) then
          begin
            { operands were swapped: emit right-left explicitly with
              SUB on the low dword and SBB on the high dword, using a
              scratch register so left keeps holding the result }
            r:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
            cg64.a_load64low_loc_reg(current_asmdata.CurrAsmList,right.location,r);
            cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
            emit_reg_reg(op1,opsize,left.location.register64.reglo,r);
            emit_reg_reg(A_MOV,opsize,r,left.location.register64.reglo);
            cg64.a_load64high_loc_reg(current_asmdata.CurrAsmList,right.location,r);
            { the carry flag is still ok }
            emit_reg_reg(op2,opsize,left.location.register64.reghi,r);
            { We need to keep the FLAGS register allocated for overflow checks }
            if not mboverflow or not needoverflowcheck then
              cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
            emit_reg_reg(A_MOV,opsize,r,left.location.register64.reghi);
          end
        else
          begin
            if mboverflow and needoverflowcheck then
              cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
            cg64.a_op64_loc_reg(current_asmdata.CurrAsmList,op,location.size,right.location,
              left.location.register64);
          end;
        location_freetemp(current_asmdata.CurrAsmList,right.location);
      end;

    { only in case of overflow operations }
    { produce overflow code }
    { we must put it here directly, because sign of operation }
    { is in unsigned VAR!!                                    }
    if mboverflow then
      begin
        if needoverflowcheck then
          begin
            current_asmdata.getjumplabel(hl4);
            { unsigned overflow sets the carry flag, signed overflow the
              overflow flag; jump past the RTL error call if no overflow }
            if unsigned then
              cg.a_jmp_flags(current_asmdata.CurrAsmList,F_AE,hl4)
            else
              cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NO,hl4);
            cg.a_reg_dealloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
            cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
            cg.a_label(current_asmdata.CurrAsmList,hl4);
          end;
      end;

    location_copy(location,left.location);
  end;
{ Generates code for 64-bit comparisons on the 32-bit target. The result
  is a LOC_JUMP location: the high dwords are compared first (signedness
  matters there), and only if they are equal are the low dwords compared
  (always as unsigned). For relational compares against constants whose
  low dword is 0, the second compare can be omitted. }
procedure ti386addnode.second_cmp64bit;
  var
    truelabel,
    falselabel,
    hlab : tasmlabel;    { <> nil when the low-dword compare can be skipped }
    href : treference;
    unsigned : boolean;

  { emits the jumps that follow the high-dword compare }
  procedure firstjmp64bitcmp;
    var
      oldnodetype : tnodetype;
    begin
      { the jump sequence is a little bit hairy }
      case nodetype of
        ltn,gtn:
          begin
            if (hlab<>location.truelabel) then
              cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),location.truelabel);
            { cheat a little bit for the negative test: temporarily treat
              the node as swapped so getresflags yields the inverse relation }
            toggleflag(nf_swapped);
            if (hlab<>location.falselabel) then
              cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),location.falselabel);
            toggleflag(nf_swapped);
          end;
        lten,gten:
          begin
            { on the high dword, <= behaves like < and >= like > ;
              equality falls through to the low-dword compare }
            oldnodetype:=nodetype;
            if nodetype=lten then
              nodetype:=ltn
            else
              nodetype:=gtn;
            if (hlab<>location.truelabel) then
              cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),location.truelabel);
            { cheat for the negative test }
            if nodetype=ltn then
              nodetype:=gtn
            else
              nodetype:=ltn;
            if (hlab<>location.falselabel) then
              cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),location.falselabel);
            nodetype:=oldnodetype;
          end;
        equaln:
          { unequal high dwords decide equaln/unequaln immediately }
          cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NE,location.falselabel);
        unequaln:
          cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NE,location.truelabel);
        else
          internalerror(2019050905);
      end;
    end;

  { emits the jumps that follow the low-dword compare; at this point the
    high dwords are known to be equal }
  procedure secondjmp64bitcmp;
    begin
      { the jump sequence is a little bit hairy }
      case nodetype of
        ltn,gtn,lten,gten:
          begin
            { the comparison of the low dword has to be }
            { always unsigned!                          }
            cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(true),location.truelabel);
            cg.a_jmp_always(current_asmdata.CurrAsmList,location.falselabel);
          end;
        equaln:
          begin
            cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NE,location.falselabel);
            cg.a_jmp_always(current_asmdata.CurrAsmList,location.truelabel);
          end;
        unequaln:
          begin
            cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NE,location.truelabel);
            cg.a_jmp_always(current_asmdata.CurrAsmList,location.falselabel);
          end;
        else
          internalerror(2019050904);
      end;
    end;

  begin
    truelabel:=nil;
    falselabel:=nil;
    pass_left_right;
    { the compare is unsigned if either operand is u64bit }
    unsigned:=((left.resultdef.typ=orddef) and
               (torddef(left.resultdef).ordtype=u64bit)) or
              ((right.resultdef.typ=orddef) and
               (torddef(right.resultdef).ordtype=u64bit));

    { we have LOC_JUMP as result }
    current_asmdata.getjumplabel(truelabel);
    current_asmdata.getjumplabel(falselabel);
    location_reset_jump(location,truelabel,falselabel);

    { Relational compares against constants having low dword=0 can omit the
      second compare based on the fact that any unsigned value is >=0 }
    hlab:=nil;
    if (right.location.loc=LOC_CONSTANT) and
       (lo(right.location.value64)=0) then
      begin
        case getresflags(true) of
          F_AE: hlab:=location.truelabel ;
          F_B:  hlab:=location.falselabel;
          else
            ;
        end;
      end;

    { constant vs. memory: compare directly against the reference,
      high dword first (at offset +4, little-endian layout) }
    if (right.location.loc=LOC_CONSTANT) and
       (left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
      begin
        tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
        href:=left.location.reference;
        inc(href.offset,4);
        cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
        emit_const_ref(A_CMP,S_L,aint(hi(right.location.value64)),href);
        firstjmp64bitcmp;
        cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
        if assigned(hlab) then
          cg.a_jmp_always(current_asmdata.CurrAsmList,hlab)
        else
          begin
            cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
            emit_const_ref(A_CMP,S_L,aint(lo(right.location.value64)),left.location.reference);
            secondjmp64bitcmp;
            cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
          end;
        location_freetemp(current_asmdata.CurrAsmList,left.location);
        exit;
      end;

    { left and right no register?  }
    { then one must be demanded    }
    if not (left.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
      begin
        if not (right.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true)
        else
          begin
            location_swap(left.location,right.location);
            toggleflag(nf_swapped);
          end;
      end;

    { at this point, left.location.loc should be LOC_[C]REGISTER }
    case right.location.loc of
      LOC_REGISTER,
      LOC_CREGISTER :
        begin
          cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
          emit_reg_reg(A_CMP,S_L,right.location.register64.reghi,left.location.register64.reghi);
          firstjmp64bitcmp;
          emit_reg_reg(A_CMP,S_L,right.location.register64.reglo,left.location.register64.reglo);
          secondjmp64bitcmp;
          cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
        end;
      LOC_CREFERENCE,
      LOC_REFERENCE :
        begin
          tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
          href:=right.location.reference;
          inc(href.offset,4);
          cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
          emit_ref_reg(A_CMP,S_L,href,left.location.register64.reghi);
          firstjmp64bitcmp;
          emit_ref_reg(A_CMP,S_L,right.location.reference,left.location.register64.reglo);
          secondjmp64bitcmp;
          cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
          location_freetemp(current_asmdata.CurrAsmList,right.location);
        end;
      LOC_CONSTANT :
        begin
          cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
          current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_CMP,S_L,aint(hi(right.location.value64)),left.location.register64.reghi));
          firstjmp64bitcmp;
          cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
          if assigned(hlab) then
            cg.a_jmp_always(current_asmdata.CurrAsmList,hlab)
          else
            begin
              cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_CMP,S_L,aint(lo(right.location.value64)),left.location.register64.reglo));
              secondjmp64bitcmp;
              cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
            end;
        end;
      else
        internalerror(2002032803);
    end;
  end;
{*****************************************************************************
x86 MUL
*****************************************************************************}
{ Moves the multiplication result out of the fixed EDX:EAX (64-bit) or
  EAX (32-bit) registers into freshly allocated virtual registers and
  sets up location accordingly. Each cpu register is released only
  right before its value is copied, to keep its live range minimal. }
procedure ti386addnode.set_mul_result_location;
  begin
    location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
    {Free EAX,EDX}
    cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
    if is_64bit(resultdef) then
      begin
        {Allocate a couple of registers and store EDX:EAX into it}
        location.register64.reghi := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EDX, location.register64.reghi);
        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
        location.register64.reglo := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EAX, location.register64.reglo);
      end
    else
      begin
        {Allocate a new register and store the result in EAX in it.}
        location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EAX,location.register);
      end;
    { operand temps are no longer needed once the result is in registers }
    location_freetemp(current_asmdata.CurrAsmList,left.location);
    location_freetemp(current_asmdata.CurrAsmList,right.location);
  end;
{ Generates code for unsigned 32-bit and (signed or unsigned) 32->64 bit
  multiplications. Uses the 3-operand MULX instruction on BMI2-capable
  targets when no overflow check is needed; otherwise falls back to the
  classic one-operand MUL/IMUL producing its result in EDX:EAX. }
procedure ti386addnode.second_mul(unsigned: boolean);

var reg,reghi,reglo:Tregister;
    ref:Treference;
    use_ref:boolean;      { true when left stays in memory as MUL operand }
    hl4 : tasmlabel;      { skips the overflow call }

const
  { false -> signed IMUL, true -> unsigned MUL }
  asmops: array[boolean] of tasmop = (A_IMUL, A_MUL);

begin
  pass_left_right;
  reg:=NR_NO;
  reference_reset(ref,sizeof(pint),[]);

  { Mul supports registers and references, so if not register/reference,
    load the location into a register.
    The variant of IMUL which is capable of doing 32->64 bits has the same restrictions. }
  use_ref:=false;
  if left.location.loc in [LOC_REGISTER,LOC_CREGISTER] then
    reg:=left.location.register
  else if left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
    begin
      tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
      ref:=left.location.reference;
      use_ref:=true;
    end
  else
    begin
      { LOC_CONSTANT for example.}
      reg:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
      hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,left.resultdef,osuinttype,left.location,reg);
    end;

  { MULX cannot report overflow, so it is only usable when no check is
    needed or when the 32->64 bit result makes overflow impossible }
  if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) and
    (not(needoverflowcheck) or
    { 32->64 bit cannot overflow }
    is_64bit(resultdef)) then
    begin
      { MULX takes its implicit source operand in EDX }
      cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
      hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,osuinttype,right.location,NR_EDX);
      reglo:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
      reghi:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
      if use_ref then
        current_asmdata.CurrAsmList.concat(Taicpu.Op_ref_reg_reg(A_MULX,S_L,ref,reglo,reghi))
      else
        emit_reg_reg_reg(A_MULX,S_L,reg,reglo,reghi);
      cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
      location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
      location.register64.reglo:=reglo;
      { for a 32-bit result the high half is simply discarded }
      if is_64bit(resultdef) then
        location.register64.reghi:=reghi;
      location_freetemp(current_asmdata.CurrAsmList,left.location);
      location_freetemp(current_asmdata.CurrAsmList,right.location);
    end
  else
    begin
      { Allocate EAX. }
      cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
      { Load the right value. }
      hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,osuinttype,right.location,NR_EAX);
      { Also allocate EDX, since it is also modified by a mul (JM). }
      cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
      if needoverflowcheck then
        cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
      if use_ref then
        emit_ref(asmops[unsigned],S_L,ref)
      else
        emit_reg(asmops[unsigned],S_L,reg);
      if needoverflowcheck and
        { 32->64 bit cannot overflow }
        (not is_64bit(resultdef)) then
        begin
          { MUL sets the carry flag when the result exceeds 32 bits }
          current_asmdata.getjumplabel(hl4);
          cg.a_jmp_flags(current_asmdata.CurrAsmList,F_AE,hl4);
          cg.a_reg_dealloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
          cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
          cg.a_label(current_asmdata.CurrAsmList,hl4);
        end;
      set_mul_result_location;
    end;
end;
{ Generates inline code for 64x64->64 bit multiplication:
    result = lo(l)*lo(r) + ((lo(l)*hi(r) + hi(l)*lo(r)) shl 32)
  The hi*hi term only affects bits 64..127 and is dropped.
  Fix/consistency: the right=0 early-exit branch (added for #40182) now
  uses the local 'list' alias like the rest of the procedure instead of
  spelling out current_asmdata.CurrAsmList (identical behavior). }
procedure ti386addnode.second_mul64bit;
  var
    list: TAsmList;          { alias for current_asmdata.CurrAsmList }
    hreg1,hreg2: tregister;  { hold the cross terms lo*hi and hi*lo }
  begin
    { 64x64 multiplication yields 128-bit result, but we're only
      interested in its lower 64 bits. This lower part is independent
      of operand signs, and so is the generated code.           }
    { pass_left_right already called from second_add64bit }
    list:=current_asmdata.CurrAsmList;
    if left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
      tcgx86(cg).make_simple_ref(list,left.location.reference);
    if right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
      tcgx86(cg).make_simple_ref(list,right.location.reference);
    { calculate 32-bit terms lo(right)*hi(left) and hi(left)*lo(right) }
    if (right.location.loc=LOC_CONSTANT) then
      begin
        { if left has side effects, it could be that this code is called with right.location.value64=0,
          see also #40182 }
        if right.location.value64=0 then
          begin
            { the whole product is 0; left was still evaluated for its side effects }
            location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
            location.register64.reglo := cg.getintregister(list,OS_INT);
            emit_const_reg(A_MOV,S_L,0,location.register64.reglo);
            location.register64.reghi := cg.getintregister(list,OS_INT);
            emit_const_reg(A_MOV,S_L,0,location.register64.reghi);
            exit;
          end;
        { Omit zero terms, if any }
        hreg1:=NR_NO;
        hreg2:=NR_NO;
        if lo(right.location.value64)<>0 then
          hreg1:=cg.getintregister(list,OS_INT);
        if hi(right.location.value64)<>0 then
          hreg2:=cg.getintregister(list,OS_INT);
        { Take advantage of 3-operand form of IMUL }
        case left.location.loc of
          LOC_REGISTER,LOC_CREGISTER:
            begin
              if hreg1<>NR_NO then
                emit_const_reg_reg(A_IMUL,S_L,longint(lo(right.location.value64)),left.location.register64.reghi,hreg1);
              if hreg2<>NR_NO then
                emit_const_reg_reg(A_IMUL,S_L,longint(hi(right.location.value64)),left.location.register64.reglo,hreg2);
            end;
          LOC_REFERENCE,LOC_CREFERENCE:
            begin
              { low dword of left is at offset 0, high dword at +4 }
              if hreg2<>NR_NO then
                list.concat(taicpu.op_const_ref_reg(A_IMUL,S_L,longint(hi(right.location.value64)),left.location.reference,hreg2));
              inc(left.location.reference.offset,4);
              if hreg1<>NR_NO then
                list.concat(taicpu.op_const_ref_reg(A_IMUL,S_L,longint(lo(right.location.value64)),left.location.reference,hreg1));
              dec(left.location.reference.offset,4);
            end;
          else
            InternalError(2014011602);
        end;
      end
    else
      begin
        hreg1:=cg.getintregister(list,OS_INT);
        hreg2:=cg.getintregister(list,OS_INT);
        cg64.a_load64low_loc_reg(list,left.location,hreg1);
        cg64.a_load64high_loc_reg(list,left.location,hreg2);
        case right.location.loc of
          LOC_REGISTER,LOC_CREGISTER:
            begin
              { hreg1 := lo(left)*hi(right), hreg2 := hi(left)*lo(right) }
              emit_reg_reg(A_IMUL,S_L,right.location.register64.reghi,hreg1);
              emit_reg_reg(A_IMUL,S_L,right.location.register64.reglo,hreg2);
            end;
          LOC_REFERENCE,LOC_CREFERENCE:
            begin
              emit_ref_reg(A_IMUL,S_L,right.location.reference,hreg2);
              inc(right.location.reference.offset,4);
              emit_ref_reg(A_IMUL,S_L,right.location.reference,hreg1);
              dec(right.location.reference.offset,4);
            end;
          else
            InternalError(2014011603);
        end;
      end;
    { add hi*lo and lo*hi terms together }
    if (hreg1<>NR_NO) and (hreg2<>NR_NO) then
      emit_reg_reg(A_ADD,S_L,hreg2,hreg1);
    { load lo(right) into EAX }
    cg.getcpuregister(list,NR_EAX);
    cg64.a_load64low_loc_reg(list,right.location,NR_EAX);
    { multiply EAX by lo(left), producing 64-bit value in EDX:EAX }
    cg.getcpuregister(list,NR_EDX);
    if (left.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
      emit_reg(A_MUL,S_L,left.location.register64.reglo)
    else if (left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
      emit_ref(A_MUL,S_L,left.location.reference)
    else
      InternalError(2014011604);
    { add previously calculated terms to the high half }
    if (hreg1<>NR_NO) then
      emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX)
    else if (hreg2<>NR_NO) then
      emit_reg_reg(A_ADD,S_L,hreg2,NR_EDX)
    else
      { at least one cross term must exist here: the right=0 constant case
        already exited above }
      InternalError(2014011601);
    { Result is now in EDX:EAX. Copy it to virtual registers. }
    set_mul_result_location;
  end;
begin
  { register this class as the implementation of add nodes for this target }
  caddnode:=ti386addnode;
end.