{
    Copyright (c) 2000-2002 by Florian Klaempfl

    Common code generation for add nodes on the i386 and x86

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit nx86add;

{$i fpcdefs.inc}

interface

    uses
      symtype,
      cgbase,
      cpubase,
      node,nadd,ncgadd;

    type
      { Add node with the code generation shared by the i386 and x86
        targets (see the unit header). Derives from tcgaddnode. }
      tx86addnode = class(tcgaddnode)
      protected
        { returns the integer condition flag matching nodetype; 'unsigned'
          selects the F_A/F_B family instead of F_G/F_L and a set
          nf_swapped mirrors the ordering conditions }
        function  getresflags(unsigned : boolean) : tresflags;
        { same as getresflags, but returns the F_F* flags }
        function  getfpuresflags : tresflags;
        { makes sure left.location is LOC_REGISTER; unless noswap is set the
          operands are swapped when right already lives in a register }
        procedure left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
        { forces both operands into LOC_FPUREGISTER and updates nf_swapped }
        procedure force_left_and_right_fpureg;
        { prepares the operands for an x87 instruction; an OS_F32/OS_F64
          sized memory operand is handed back in refnode, otherwise
          refnode is nil }
        procedure prepare_x87_locations(out refnode: tnode);
        { emits 'op right,left' with left being a register; AllocFlags
          requests allocation of NR_DEFAULTFLAGS before the instruction }
        procedure emit_op_right_left(op:TAsmOp;opsize:TCgSize;AllocFlags:boolean);
        { emits the instruction for an integer/set operation including the
          optional overflow check }
        procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);
        { NOTE(review): the three methods below are declared here; only the
          SSE variant is implemented completely in the part of the unit
          reviewed }
        procedure second_cmpfloatvector;
        procedure second_addfloatsse;
        procedure second_addfloatavx;
      public
        function pass_1 : tnode;override;
        function simplify(forinline : boolean) : tnode; override;
        function use_fma : boolean;override;
        procedure second_addfloat;override;
{$ifndef i8086}
        { small set operations, not compiled for i8086 }
        procedure second_addsmallset;override;
        procedure second_addsmallsetelement;override;
{$endif not i8086}
        procedure second_add64bit;override;
        procedure second_cmpfloat;override;
        procedure second_cmpsmallset;override;
        procedure second_cmp64bit;override;
        procedure second_cmpordinal;override;
        procedure
second_addordinal;override;
        procedure second_addboolean;override;
{$ifdef SUPPORT_MMX}
        procedure second_opmmx;override;
{$endif SUPPORT_MMX}
        procedure second_opvector;override;
      end;


implementation

    uses
      globtype,globals,
      verbose,cutils,compinnr,
      cpuinfo,
      aasmbase,aasmdata,aasmcpu,
      symconst,symdef,
      cgobj,hlcgobj,cgx86,cga,cgutils,
      tgobj,ncgutil,nutils,
      ncon,nset,ninl,ncnv,ncal,nmat,
      defutil,defcmp,constexp,
      pass_1,pass_2,htypechk;

{ Range check must be disabled explicitly as the code serves
  on three different architecture sizes }
{$R-}

{*****************************************************************************
                                  Helpers
*****************************************************************************}

    { Emits the instruction 'op' for the already evaluated operands.
      left.location is expected to be LOC_REGISTER on entry and holds the
      result afterwards (locations may be swapped, nf_swapped is kept in
      sync).

        op         - instruction to emit
        opsize     - operand size of the operation
        unsigned   - selects the jump used by the overflow check:
                     F_AE when true, F_NO otherwise
        extra_not  - complement the subtrahend/right operand first
                     (A_NOT is emitted before the operation)
        mboverflow - the operation may overflow; together with
                     needoverflowcheck this appends a conditional call
                     to FPC_OVERFLOW

      For a constant right operand a few cheaper encodings are chosen
      (TEST for a comparison with 0, INC/DEC, SHL, LEA). All replacements
      of an arithmetic instruction are only done when no overflow check is
      requested, as the check below relies on the flags of 'op'. }
    procedure tx86addnode.emit_generic_code(op:TAsmOp;opsize:TCGSize;unsigned,extra_not,mboverflow:boolean);
      var
        power : longint;
        hl4 : tasmlabel;
        r : Tregister;
        href : treference;
        overflowcheck: boolean;
        comparison: boolean;
      begin
        overflowcheck:=needoverflowcheck;
        { flag-only instructions or a boolean result: the flags register
          is allocated here and not released by the overflow code }
        comparison:=
          (op=A_CMP) or (op=A_TEST) or (op=A_BT) or is_boolean(resultdef);

        { at this point, left.location.loc should be LOC_REGISTER }
        if right.location.loc=LOC_REGISTER then
         begin
           { right.location is a LOC_REGISTER }
           { when swapped another result register }
           if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],left.location.register);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,right.location.register);
              { newly swapped also set swapped flag }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
           else
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
              if (op=A_ADD) or (op=A_OR) or (op=A_AND) or (op=A_XOR) or (op=A_IMUL) then
                location_swap(left.location,right.location);
              if comparison then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              emit_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register);
            end;
         end
        else
         begin
           { right.location is not a LOC_REGISTER }
           if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NOT,opsize,left.location.register,left.location.register);
              { load right into a scratch register, operate on it and
                move the result back into the left register }
              r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
              if comparison then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,r);
              cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,r,left.location.register);
            end
           else
            begin
              { Optimizations when right.location is a constant value }
              if (op=A_CMP) and
                 (nodetype in [equaln,unequaln]) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=0) then
                begin
                  { 'test $-1,%reg' is transformable into 'test $-1,spilltemp' if %reg needs
                    spilling, while 'test %reg,%reg' still requires loading into register.
                    If spilling is not necessary, it is changed back into 'test %reg,%reg' by
                    peephole optimizer (this optimization is currently available only for i386). }
                  cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
{$ifdef i386}
                  emit_const_reg(A_TEST,TCGSize2Opsize[opsize],aint(-1),left.location.register)
{$else i386}
                  emit_reg_reg(A_TEST,TCGSize2Opsize[opsize],left.location.register,left.location.register);
{$endif i386}
                end
              else
                if (op=A_ADD) and
                   (right.location.loc=LOC_CONSTANT) and
                   (right.location.value=1) and
                   not overflowcheck and
                   UseIncDec then
                  begin
                    emit_reg(A_INC,TCGSize2Opsize[opsize],left.location.register);
                  end
              else
                if (op=A_SUB) and
                   (right.location.loc=LOC_CONSTANT) and
                   (right.location.value=1) and
                   not overflowcheck and
                   UseIncDec then
                  begin
                    emit_reg(A_DEC,TCGSize2Opsize[opsize],left.location.register);
                  end
              else
                { multiplication by a power of two becomes a shift. This is
                  only valid without overflow checking: the jump emitted
                  below tests the flags IMUL would have produced, a SHL
                  does not provide them. Same guard as INC/DEC/LEA. }
                if (op=A_IMUL) and
                   (right.location.loc=LOC_CONSTANT) and
                   (ispowerof2(int64(right.location.value),power)) and
                   not overflowcheck then
                  begin
                    emit_const_reg(A_SHL,TCGSize2Opsize[opsize],power,left.location.register);
                  end
              else
                { multiplication by 2^power+1 with power in 1..3:
                  lea (reg,reg,value-1),newreg }
                if (op=A_IMUL) and
                   (right.location.loc=LOC_CONSTANT) and
                   (right.location.value>1) and
                   (ispowerof2(int64(right.location.value)-1,power)) and
                   (power in [1..3]) and
                   not overflowcheck then
                  begin
                    reference_reset_base(href,left.location.register,0,ctempposinvalid,0,[]);
                    href.index:=left.location.register;
                    href.scalefactor:=int64(right.location.value)-1;
                    left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(A_LEA,TCgSize2OpSize[opsize],href,left.location.register));
                  end
              else
                begin
                  if extra_not then
                    begin
                      { complement a copy of right and AND it into left }
                      r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                      hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
                      emit_reg(A_NOT,TCGSize2Opsize[opsize],r);
                      if comparison or (mboverflow and overflowcheck) then
                        cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                      emit_reg_reg(A_AND,TCGSize2Opsize[opsize],r,left.location.register);
                    end
                  else
                    emit_op_right_left(op,opsize,comparison or (mboverflow and overflowcheck));
                end;
            end;
         end;

        { only in case of overflow operations }
        { produce overflow code }
        { we must put it here directly, because sign of operation }
        { is in unsigned VAR!! }
        if mboverflow then
         begin
           if overflowcheck then
            begin
              current_asmdata.getjumplabel(hl4);
              { skip the call to FPC_OVERFLOW when no overflow happened }
              if unsigned then
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_AE,hl4)
              else
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NO,hl4);

              if not comparison then
                cg.a_reg_dealloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);

              cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
              cg.a_label(current_asmdata.CurrAsmList,hl4);
            end;
         end;
      end;


    { Makes sure that left.location is a LOC_REGISTER. Unless noswap is set,
      the operands are swapped (and nf_swapped toggled) when right is
      already in a LOC_REGISTER/LOC_CREGISTER. Afterwards every non-constant
      operand whose size differs from opsize (ignoring signedness) is
      forced into a register of type opdef. }
    procedure tx86addnode.left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
      begin
        { left location is not a register? }
        if (left.location.loc<>LOC_REGISTER) then
         begin
           { if right is register then we can swap the locations }
           if (not noswap) and
              (right.location.loc=LOC_REGISTER) then
            begin
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
           else if (not noswap) and
              (right.location.loc=LOC_CREGISTER) then
            begin
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
              { maybe we can reuse a constant register when the
                operation is a comparison that doesn't change the
                value of the register }
              hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,(nodetype in [ltn,lten,gtn,gten,equaln,unequaln]));
              location:=left.location;
            end
           else
            begin
              { maybe we can reuse a constant register when the
                operation is a comparison that doesn't change the
                value of the register }
              hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,(nodetype in [ltn,lten,gtn,gten,equaln,unequaln]));
            end;
         end;
        if (right.location.loc<>LOC_CONSTANT) and
           (tcgsize2unsigned[right.location.size]<>tcgsize2unsigned[opsize]) then
          hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
        if (left.location.loc<>LOC_CONSTANT) and
           (tcgsize2unsigned[left.location.size]<>tcgsize2unsigned[opsize]) then
hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
      end;


    { Forces both operands into LOC_FPUREGISTER. nf_swapped is toggled
      when left was already in an fpu register before right got loaded,
      and when both operands were in fpu registers on entry. }
    procedure tx86addnode.force_left_and_right_fpureg;
      begin
        if (right.location.loc<>LOC_FPUREGISTER) then
          begin
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);
            if (left.location.loc<>LOC_FPUREGISTER) then
              hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false)
            else
              { left was on the stack => swap }
              toggleflag(nf_swapped);
          end
        { the nominator in st0 }
        else if (left.location.loc<>LOC_FPUREGISTER) then
          begin
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false)
          end
        else
          begin
            { fpu operands are always in the wrong order on the stack }
            toggleflag(nf_swapped);
          end;
      end;


    { Makes sides suitable for executing an x87 instruction:
      if either side is OS_F32/OS_F64-sized LOC_REFERENCE, it is returned in 'refnode'
      everything else is loaded to FPU stack. }
    procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
      begin
        refnode:=nil;

        { later on, no mm registers are allowed, so transfer everything to memory here
          below it is loaded into an fpu register if needed }
        if left.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);

        if right.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

        { dispatch on the number of operands already in an fpu register }
        case ord(left.location.loc=LOC_FPUREGISTER)+ord(right.location.loc=LOC_FPUREGISTER) of
          0:
            begin
              hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);
              if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                InternalError(2013090803);
              if (left.location.size in [OS_F32,OS_F64]) then
                begin
                  refnode:=left;
                  toggleflag(nf_swapped);
                end
              else
                hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
            end;
          1:
            begin
              { if left is on the stack then swap. }
              if (left.location.loc=LOC_FPUREGISTER) then
                refnode:=right
              else
                refnode:=left;
              if not(refnode.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                InternalError(2013090801);
              if not (refnode.location.size in [OS_F32,OS_F64]) then
                begin
                  hlcg.location_force_fpureg(current_asmdata.CurrAsmList,refnode.location,refnode.resultdef,false);
                  if (refnode=right) then
                    toggleflag(nf_swapped);
                  refnode:=nil;
                end
              else
                begin
                  if (refnode=left) then
                    toggleflag(nf_swapped);
                end;
            end;
          2:
            { fpu operands are always in the wrong order on the stack }
            toggleflag(nf_swapped);
          else
            InternalError(2013090802);
        end;
      end;


    { Emits 'op right,left'. left.location must be a register, right may be
      a register, a memory reference or a constant (subset locations are
      loaded into a register first); any other location raises an
      internal error.

        op         - instruction to emit
        opsize     - operand size of the instruction
        AllocFlags - allocate NR_DEFAULTFLAGS right before the instruction

      On x86_64 a 64 bit constant outside the longint range is loaded into
      a temporary register first, as it cannot be used as an immediate
      operand. }
    procedure tx86addnode.emit_op_right_left(op:TAsmOp;opsize:TCgsize;AllocFlags:boolean);
{$ifdef x86_64}
      var
        tmpreg : tregister;
{$endif x86_64}
      begin
        if (right.location.loc in [LOC_CSUBSETREG,LOC_SUBSETREG,LOC_SUBSETREF,LOC_CSUBSETREF]) then
          hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
        { left must be a register }
        case right.location.loc of
          LOC_REGISTER,
          LOC_CREGISTER :
            begin
              if AllocFlags then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register));
            end;
          LOC_REFERENCE,
          LOC_CREFERENCE :
            begin
              tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
              if AllocFlags then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,TCGSize2Opsize[opsize],right.location.reference,left.location.register));
            end;
          LOC_CONSTANT :
            begin
{$ifdef x86_64}
              { x86_64 only supports signed 32 bits constants directly }
              if (opsize in [OS_S64,OS_64]) and
                 ((right.location.value<low(longint)) or (right.location.value>high(longint))) then
                begin
                  tmpreg:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,right.location.value,tmpreg);
                  if AllocFlags then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],tmpreg,left.location.register));
                end
              else
{$endif x86_64}
                begin
                  if AllocFlags then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(op,TCGSize2Opsize[opsize],right.location.value,left.location.register));
                end;
            end;
          else
            internalerror(200203232);
        end;
      end;


    { Returns the condition flag for the integer comparison in nodetype.
      equaln/unequaln map to F_E/F_NE. For the ordering comparisons
      'unsigned' selects F_B/F_BE/F_A/F_AE instead of F_L/F_LE/F_G/F_GE,
      and a set nf_swapped mirrors the condition. Any other node type
      raises an internal error. }
    function tx86addnode.getresflags(unsigned : boolean) : tresflags;
      begin
         case nodetype of
           equaln : getresflags:=F_E;
           unequaln : getresflags:=F_NE;
          else
           if not(unsigned) then
             begin
                if nf_swapped in flags then
                  case nodetype of
                     ltn : getresflags:=F_G;
                     lten : getresflags:=F_GE;
                     gtn : getresflags:=F_L;
                     gten : getresflags:=F_LE;
                     else
                       internalerror(2013120105);
                  end
                else
                  case nodetype of
                     ltn : getresflags:=F_L;
                     lten : getresflags:=F_LE;
                     gtn : getresflags:=F_G;
                     gten : getresflags:=F_GE;
                     else
                       internalerror(2013120106);
                  end;
             end
           else
             begin
                if nf_swapped in flags then
                  case nodetype of
                     ltn : getresflags:=F_A;
                     lten : getresflags:=F_AE;
                     gtn : getresflags:=F_B;
                     gten : getresflags:=F_BE;
                     else
                       internalerror(2013120107);
                  end
                else
                  case nodetype of
                     ltn : getresflags:=F_B;
                     lten : getresflags:=F_BE;
                     gtn : getresflags:=F_A;
                     gten : getresflags:=F_AE;
                     else
                       internalerror(2013120108);
                  end;
             end;
         end;
      end;


    { Returns the F_F* condition flag for the comparison in nodetype;
      nf_swapped mirrors the ordering conditions, any other node type
      raises an internal error. }
    function tx86addnode.getfpuresflags : tresflags;
      begin
        if (nodetype=equaln) then
          result:=F_FE
        else if (nodetype=unequaln) then
          result:=F_FNE
        else if (nf_swapped in flags) then
          case nodetype of
            ltn : result:=F_FA;
            lten : result:=F_FAE;
            gtn : result:=F_FB;
            gten : result:=F_FBE;
            else
              internalerror(2014031402);
          end
        else
          case nodetype of
            ltn : result:=F_FB;
            lten : result:=F_FBE;
            gtn : result:=F_FA;
            gten : result:=F_FAE;
            else
              internalerror(2014031403);
          end;
      end;


{*****************************************************************************
                                AddSmallSet
*****************************************************************************}

{$ifndef i8086}
    procedure tx86addnode.second_addsmallset;
var
        { second_addsmallset: generates code for the small set operators
          (union, symmetric difference, intersection, difference and
          adding a single element). The result location is a register. }
        setbase : aint;
        opdef : tdef;
        opsize : TCGSize;
        op : TAsmOp;
        extra_not,
        noswap : boolean;
        all_member_optimization:boolean;
      begin
        pass_left_right;

        noswap:=false;
        extra_not:=false;
        all_member_optimization:=false;
        opdef:=resultdef;
        opsize:=int_cgsize(opdef.size);

        { take the set base from whichever operand is the set }
        if (left.resultdef.typ=setdef) then
          setbase:=tsetdef(left.resultdef).setbase
        else
          setbase:=tsetdef(right.resultdef).setbase;
        { select the instruction for the node type }
        case nodetype of
          addn :
            begin
              { adding elements is not commutative }
              if (nf_swapped in flags) and (left.nodetype=setelementn) then
                swapleftright;
              { are we adding set elements ? }
              if right.nodetype=setelementn then
                begin
                  { no range support for smallsets! }
                  if assigned(tsetelementnode(right).right) then
                    internalerror(43244);
                  { btsb isn't supported }
                  if opsize=OS_8 then
                    begin
                      opsize:=OS_32;
                      opdef:=u32inttype;
                    end;
                  { bts requires both elements to be registers }
                  hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
                  hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
                  register_maybe_adjust_setbase(current_asmdata.CurrAsmList,opdef,right.location,setbase);
                  op:=A_BTS;
                  noswap:=true;
                end
              else
                op:=A_OR;
            end;
          symdifn :
            op:=A_XOR;
          muln :
            op:=A_AND;
          subn :
            begin
              op:=A_AND;
              { the minuend (taking nf_swapped into account) is the
                constant -1, i.e. all members are set }
              if (not(nf_swapped in flags) and (left.location.loc=LOC_CONSTANT) and (left.location.value=-1)) or
                 ((nf_swapped in flags) and (right.location.loc=LOC_CONSTANT) and (right.location.value=-1)) then
                all_member_optimization:=true;

              { a constant subtrahend is complemented at compile time,
                otherwise a NOT has to be generated }
              if (not(nf_swapped in flags)) and
                 (right.location.loc=LOC_CONSTANT) then
                right.location.value := not(right.location.value)
              else if (nf_swapped in flags) and
                 (left.location.loc=LOC_CONSTANT) then
                left.location.value := not(left.location.value)
              else
                extra_not:=true;
            end;
          xorn :
            op:=A_XOR;
          orn :
            op:=A_OR;
          andn :
            op:=A_AND;
          else
            internalerror(2003042215);
        end;
        if all_member_optimization then
          begin
            {A set expression [0..31]-x can be implemented with a simple NOT.}
            if nf_swapped in flags then
              begin
                { newly swapped also set swapped flag }
                location_swap(left.location,right.location);
                toggleflag(nf_swapped);
              end;
            hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,false);
            emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
            location:=right.location;
          end
        else
          begin
            { can we use the BMI1 instruction andn? }
            if (op=A_AND) and extra_not and (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) and
              (resultdef.size in [4{$ifdef x86_64},8{$endif x86_64}]) then
              begin
                location_reset(location,LOC_REGISTER,left.location.size);
                location.register:=cg.getintregister(current_asmdata.currAsmList,left.location.size);
                { undo the swap so that right is the operand to be removed }
                if nf_swapped in flags then
                  begin
                    location_swap(left.location,right.location);
                    toggleflag(nf_swapped);
                  end;
                { right always ends up in a register, left may stay in memory }
                hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,true);
                if not(left.location.loc in [LOC_CREGISTER,LOC_REGISTER,LOC_CREFERENCE,LOC_REFERENCE]) then
                  hlcg.location_force_reg(current_asmdata.currAsmList,left.location,left.resultdef,opdef,true);
                case left.location.loc of
                  LOC_CREGISTER,LOC_REGISTER:
                    emit_reg_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.register,right.location.register,location.register);
                  LOC_CREFERENCE,LOC_REFERENCE:
                    emit_ref_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.reference,right.location.register,location.register);
                  else
                    Internalerror(2018040201);
                end;
              end
            else
              begin
                { left must be a register }
                left_must_be_reg(opdef,opsize,noswap);
                emit_generic_code(op,opsize,true,extra_not,false);
                location_freetemp(current_asmdata.CurrAsmList,right.location);

                { left is always a register and contains the result }
                location:=left.location;
              end;
          end;

        { fix the changed opsize we did above because of the missing btsb }
        if opsize<>int_cgsize(resultdef.size) then
          hlcg.location_force_reg(current_asmdata.CurrAsmList,location,opdef,cgsize_orddef(int_cgsize(resultdef.size)),false);
      end;


    { Adds a single element to a small set. Sets of one byte are handled
      by the inherited method; otherwise a constant element is OR-ed in as
      a mask and a variable element is set with BTS. Only addn without an
      element range is accepted. }
    procedure tx86addnode.second_addsmallsetelement;
      var
        setbase,
        mask: aint;
      begin
        if resultdef.size=1 then
          inherited second_addsmallsetelement
        else
          begin
            if nodetype<>addn then
              internalerror(2022090502);
            { no range support for smallsets }
            if assigned(tsetelementnode(right).right) then
              internalerror(2022090501);
            pass_left_right;
            { setelementn is a special case, it must be on right }
            if (nf_swapped in flags) and
               (left.nodetype=setelementn) then
              swapleftright;
            force_reg_left_right(false,false);
            set_result_location_reg;
            setbase:=tsetdef(left.resultdef).setbase;
            if (right.location.loc = LOC_CONSTANT) then
              begin
                { constant element: compute the bit mask at compile time }
                mask:=aint(1 shl (right.location.value-setbase));
                hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_OR,resultdef,
                  mask,left.location.register,location.register);
              end
            else
              begin
                { variable element: copy the set into the result register
                  and set the bit with BTS }
                hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,resultdef,true);
                register_maybe_adjust_setbase(current_asmdata.CurrAsmList,resultdef,right.location,setbase);
                if left.location.loc <> LOC_CONSTANT then
                  hlcg.a_load_reg_reg(current_asmdata.CurrAsmList,left.resultdef,resultdef,
                    left.location.register,location.register)
                else
                  hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef,
                    left.location.value,location.register);
                emit_reg_reg(A_BTS,TCGSize2Opsize[def_cgsize(resultdef)],right.location.register,location.register);
              end;
          end;
      end;
{$endif not i8086}


    { Compares two small sets; the result is returned in the flags.
      lten/gten are turned into an AND followed by a compare for
      equality. }
    procedure tx86addnode.second_cmpsmallset;
      var
        opdef : tdef;
        opsize : TCGSize;
        op : TAsmOp;
      begin
        pass_left_right;
        opdef:=left.resultdef;
        opsize:=int_cgsize(opdef.size);
        case nodetype of
          equaln,
          unequaln :
            op:=A_CMP;
          lten,gten:
            begin
              if (not(nf_swapped in flags) and (nodetype = lten)) or
                 ((nf_swapped in flags) and (nodetype = gten)) then
                swapleftright;
              hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
              { left:=left and right, compared against right below }
              emit_op_right_left(A_AND,opsize,False);
              op:=A_CMP;
              { warning: ugly hack, we need a JE so change the node to equaln }
              nodetype:=equaln;
            end;
          else
            internalerror(2003042204);
        end;
        { left must be a register }
        left_must_be_reg(opdef,opsize,false);
emit_generic_code(op,opsize,true,false,false);
        location_freetemp(current_asmdata.CurrAsmList,right.location);
        location_freetemp(current_asmdata.CurrAsmList,left.location);

        { the result of the comparison lives in the flags }
        location_reset(location,LOC_FLAGS,OS_NO);
        location.resflags:=getresflags(true);
      end;


{*****************************************************************************
                                AddMMX
*****************************************************************************}

{$ifdef SUPPORT_MMX}
    { Generates code for the MMX operators add, sub, mul, xor, or and and.
      The instruction is selected from the node type, the MMX element type
      of the left operand and, for add/sub, the saturation switch; a
      combination without a matching instruction raises an internal
      error. The result is returned in an MMX register. }
    procedure tx86addnode.second_opmmx;
      var
        op : TAsmOp;
        cmpop : boolean;
        mmxbase : tmmxtype;
        hreg,
        hregister : tregister;
      begin
        pass_left_right;

        cmpop:=false;
        { A_NOP marks 'no instruction found', checked after the case }
        op:=A_NOP;

        mmxbase:=mmx_type(left.resultdef);
        location_reset(location,LOC_MMXREGISTER,def_cgsize(resultdef));
        case nodetype of
          addn :
            begin
              if (cs_mmx_saturation in current_settings.localswitches) then
                begin
                   case mmxbase of
                      mmxs8bit:
                        op:=A_PADDSB;
                      mmxu8bit:
                        op:=A_PADDUSB;
                      mmxs16bit,mmxfixed16:
                        op:=A_PADDSW;
                      mmxu16bit:
                        op:=A_PADDUSW;
                      else
                        ;
                   end;
                end
              else
                begin
                   case mmxbase of
                      mmxs8bit,mmxu8bit:
                        op:=A_PADDB;
                      mmxs16bit,mmxu16bit,mmxfixed16:
                        op:=A_PADDW;
                      mmxs32bit,mmxu32bit:
                        op:=A_PADDD;
                      else
                        ;
                   end;
                end;
            end;
          muln :
            begin
               case mmxbase of
                  mmxs16bit,mmxu16bit:
                    op:=A_PMULLW;
                  mmxfixed16:
                    op:=A_PMULHW;
                  else
                    ;
               end;
            end;
          subn :
            begin
              if (cs_mmx_saturation in current_settings.localswitches) then
                begin
                   case mmxbase of
                      mmxs8bit:
                        op:=A_PSUBSB;
                      mmxu8bit:
                        op:=A_PSUBUSB;
                      { signed 16 bit elements need the word variant,
                        mirroring A_PADDSW in the addn case }
                      mmxs16bit,mmxfixed16:
                        op:=A_PSUBSW;
                      mmxu16bit:
                        op:=A_PSUBUSW;
                      else
                        ;
                   end;
                end
              else
                begin
                   case mmxbase of
                      mmxs8bit,mmxu8bit:
                        op:=A_PSUBB;
                      mmxs16bit,mmxu16bit,mmxfixed16:
                        op:=A_PSUBW;
                      mmxs32bit,mmxu32bit:
                        op:=A_PSUBD;
                      else
                        ;
                   end;
                end;
            end;
          xorn:
            op:=A_PXOR;
          orn:
            op:=A_POR;
          andn:
            op:=A_PAND;
          else
            internalerror(2003042214);
        end;

        if op = A_NOP then
          internalerror(201408201);

        { left and right no register?  }
        { then one must be demanded    }
        if (left.location.loc<>LOC_MMXREGISTER) then
         begin
           if (right.location.loc=LOC_MMXREGISTER) then
            begin
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
           else
            begin
              { register variable ? }
              if (left.location.loc=LOC_CMMXREGISTER) then
               begin
                 hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                 emit_reg_reg(A_MOVQ,S_NO,left.location.register,hregister);
               end
              else
               begin
                 if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                  internalerror(200203245);

                 hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                 tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                 emit_ref_reg(A_MOVQ,S_NO,left.location.reference,hregister);
               end;

              location_reset(left.location,LOC_MMXREGISTER,OS_NO);
              left.location.register:=hregister;
            end;
         end;

        { at this point, left.location.loc should be LOC_MMXREGISTER }
        if right.location.loc<>LOC_MMXREGISTER then
         begin
           if (nodetype=subn) and (nf_swapped in flags) then
            begin
              { swapped subtraction: load right into a new register and
                subtract left from it }
              hreg:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              if right.location.loc=LOC_CMMXREGISTER then
               begin
                 emit_reg_reg(A_MOVQ,S_NO,right.location.register,hreg);
                 emit_reg_reg(op,S_NO,left.location.register,hreg);
               end
              else
               begin
                 { it is right that gets loaded from memory here; left
                   is already LOC_MMXREGISTER, so testing left would
                   always raise the internal error }
                 if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                  internalerror(2002032412);
                 tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                 emit_ref_reg(A_MOVQ,S_NO,right.location.reference,hreg);
                 emit_reg_reg(op,S_NO,left.location.register,hreg);
               end;
              location.register:=hreg;
            end
           else
            begin
              if (right.location.loc=LOC_CMMXREGISTER) then
                emit_reg_reg(op,S_NO,right.location.register,left.location.register)
              else
               begin
                 if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                  internalerror(200203246);
                 tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                 emit_ref_reg(op,S_NO,right.location.reference,left.location.register);
               end;
              location.register:=left.location.register;
            end;
         end
        else
         begin
           { right.location=LOC_MMXREGISTER }
           if (nodetype=subn) and (nf_swapped in flags) then
            begin
              emit_reg_reg(op,S_NO,left.location.register,right.location.register);
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
           else
            begin
              emit_reg_reg(op,S_NO,right.location.register,left.location.register);
            end;
           location.register:=left.location.register;
         end;

        location_freetemp(current_asmdata.CurrAsmList,right.location);
        if cmpop then
          location_freetemp(current_asmdata.CurrAsmList,left.location);
      end;
{$endif SUPPORT_MMX}


{*****************************************************************************
                                AddFloat
*****************************************************************************}

    { Generates scalar floating point add/sub/mul/div using the mm
      registers. sqr(a)+sqr(b) and sqr(a)-sqr(b) get a packed code
      sequence when the fpu type is at least fpu_sse3. }
    procedure tx86addnode.second_addfloatsse;
      var
        op : topcg;
        sqr_sum : boolean;
        tmp : tnode;
      begin
        sqr_sum:=false;
        if (current_settings.fputype>=fpu_sse3) and
           use_vectorfpu(resultdef) and
           (nodetype in [addn,subn]) and
           (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
           (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
          begin
            sqr_sum:=true;
            { replace both sqr nodes by their arguments }
            tmp:=tinlinenode(left).left;
            tinlinenode(left).left:=nil;
            left.free;
            left:=tmp;

            tmp:=tinlinenode(right).left;
            tinlinenode(right).left:=nil;
            right.free;
            right:=tmp;
          end;

        pass_left_right;
        { fpu operands are always in reversed order on the stack }
        if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
          toggleflag(nf_swapped);

        if (nf_swapped in flags) then
          { can't use swapleftright if both are on the fpu stack, since then }
          { both are "R_ST" -> nothing would change -> manually switch       }
          if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
             (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
            emit_none(A_FXCH,S_NO)
          else
            swapleftright;

        case nodetype of
          addn :
            op:=OP_ADD;
          muln :
            op:=OP_MUL;
          subn :
            op:=OP_SUB;
          slashn :
            op:=OP_DIV;
          else
            internalerror(200312231);
        end;
location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

        if sqr_sum then
          begin
            { sqr(left) +/- sqr(right): put both operands into one mm
              register, square them with a packed multiply and combine
              the two squares with a horizontal add/sub }
            if nf_swapped in flags then
              swapleftright;

            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
            location:=left.location;
            if is_double(resultdef) then
              begin
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
                case nodetype of
                  addn:
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
                  subn:
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
                  else
                    internalerror(201108162);
                end;
              end
            else
              begin
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
                { ensure that bits 64..127 contain valid values }
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
                { the data is now in bits 0..32 and 64..95 }
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
                case nodetype of
                  addn:
                    begin
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
                    end;
                  subn:
                    begin
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
                    end;
                  else
                    internalerror(201108163);
                end;
              end
          end
        { we can use only right as left operand if the operation is commutative }
        else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
          begin
            { copy right into the freshly allocated result register and
              apply left to it }
            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
            cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.location.size,location.size,right.location.register,location.register,mms_movescalar);
            { force floating point reg. location to be written to memory,
              we don't force it to mm register because writing to memory
              allows probably shorter code because there is no direct fpu->mm register
              copy instruction
            }
            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
            cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,location.size,left.location,location.register,mms_movescalar);
            if left.location.loc=LOC_REFERENCE then
              tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);
          end
        else
          begin
            { general case: load left into the result register and apply
              right to it }
            if nf_swapped in flags then
              swapleftright;

            { force floating point reg. location to be written to memory,
              we don't force it to mm register because writing to memory
              allows probably shorter code because there is no direct fpu->mm register
              copy instruction
            }
            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);

            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
            cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
            if left.location.loc=LOC_REFERENCE then
              tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);
            { force floating point reg. location to be written to memory,
              we don't force it to mm register because writing to memory
              allows probably shorter code because there is no direct fpu->mm register
              copy instruction
            }
            if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
              hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
            cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,location.size,right.location,location.register,mms_movescalar);
            if right.location.loc=LOC_REFERENCE then
              tg.ungetiftemp(current_asmdata.CurrAsmList,right.location.reference);
          end;
      end;


    { AVX variant of the scalar floating point operators. The sqr_sum
      detection is only compiled when 'dummy' is defined, otherwise
      sqr_sum stays false. }
    procedure tx86addnode.second_addfloatavx;
      var
        op : topcg;
        sqr_sum : boolean;
{$ifdef dummy}
        tmp : tnode;
{$endif dummy}
      begin
        sqr_sum:=false;
{$ifdef dummy}
        if (current_settings.fputype>=fpu_sse3) and
           use_vectorfpu(resultdef) and
           (nodetype in [addn,subn]) and
           (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
           (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
          begin
            sqr_sum:=true;
            tmp:=tinlinenode(left).left;
            tinlinenode(left).left:=nil;
            left.free;
            left:=tmp;

            tmp:=tinlinenode(right).left;
            tinlinenode(right).left:=nil;
            right.free;
            right:=tmp;
          end;
{$endif dummy}

        pass_left_right;
        { fpu operands are always in reversed order on the stack }
        if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
          toggleflag(nf_swapped);

        if (nf_swapped in flags) then
          { can't use swapleftright if both are on the fpu stack, since then }
          { both are "R_ST" -> nothing would change -> manually switch       }
          if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
             (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
            emit_none(A_FXCH,S_NO)
          else
            swapleftright;

        case nodetype of
          addn :
            op:=OP_ADD;
          muln :
            op:=OP_MUL;
          subn :
            op:=OP_SUB;
          slashn :
            op:=OP_DIV;
          else
            internalerror(2003122303);
        end;

        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

        if sqr_sum then
          begin
            if nf_swapped in flags then
swapleftright; hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false); hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true); location:=left.location; if is_double(resultdef) then begin current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register)); current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register)); case nodetype of addn: current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register)); subn: current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register)); else internalerror(2011081601); end; end else begin current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register)); { ensure that bits 64..127 contain valid values } current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register)); { the data is now in bits 0..32 and 64..95 } current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register)); case nodetype of addn: begin current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register)); end; subn: begin current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register)); end; else internalerror(2011081604); end; end end { left*2 ? 
} else if (nodetype=muln) and is_constrealnode(right) and is_number_float(trealconstnode(right).value_real) and (trealconstnode(right).value_real=2) then begin location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size); hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true); cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size, left.location.register, left.location.register, location.register, mms_movescalar); end { right*2 ? } else if (nodetype=muln) and is_constrealnode(left) and is_number_float(trealconstnode(left).value_real) and (trealconstnode(left).value_real=2) then begin location.register:=cg.getmmregister(current_asmdata.CurrAsmList,right.location.size); hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true); cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size, right.location.register, right.location.register, location.register, mms_movescalar); end { we can use only right as left operand if the operation is commutative } else if (right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) and (op in [OP_ADD,OP_MUL]) then begin location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size); { force floating point reg. 
location to be written to memory, we don't force it to mm register because writing to memory allows probably shorter code because there is no direct fpu->mm register copy instruction } if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef); cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size, left.location, right.location.register, location.register, mms_movescalar); end else begin if (nf_swapped in flags) then swapleftright; hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true); location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size); { force floating point reg. location to be written to memory, we don't force it to mm register because writing to memory allows probably shorter code because there is no direct fpu->mm register copy instruction } if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef); cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size, right.location, left.location.register, location.register, mms_movescalar); end; end; function tx86addnode.pass_1: tnode; begin { only pass_1 might set the resultdef as it could be set to nil by some previous code transformation. 
As we need a valid left/right.resultdef later on, ensure a valid result def is set, see also issue #40727 }
        if not(assigned(left.resultdef)) then
          typecheckpass(left);
        if not(assigned(right.resultdef)) then
          typecheckpass(right);

        { on x86, we do not support fpu registers, so in case of operations using the x87, it
          is normally useful, not to put the operands into registers which would be mm register

          this should be called before pass_1 so we have a proper expectloc }
        if ((left.resultdef.typ=floatdef) or (right.resultdef.typ=floatdef)) and
          (not(use_vectorfpu(left.resultdef)) and not(use_vectorfpu(right.resultdef)) and not(use_vectorfpu(resultdef))) then
          begin
            make_not_regable(left,[ra_addr_regable]);
            make_not_regable(right,[ra_addr_regable]);
          end;
        Result:=inherited pass_1;
        { correct expectloc, it does not matter if Result is set as another pass_1 is run on it
          which will fix that one }
        if use_vectorfpu(resultdef) then
          expectloc:=LOC_MMREGISTER;
      end;


    { Node level simplification. Besides delegating to the inherited
      simplify, this rewrites the comparisons

        (x mod d) = 0    and    (x mod d) <> 0

      (in either operand order) with an ordinal constant d into a
      multiplication by the modular inverse of d followed by a single
      unsigned comparison, which avoids the division.

      The rewrite is only attempted when
        - optimisation level 1 is enabled and overflow checking is off,
        - the divisor is an ordinal constant >= 3 that is not a power of two
          (PopCnt(divisor) > 1),
        - on targets without a 64 bit ALU, the mod result is smaller than
          8 bytes (32 bit ALU) resp. 4 bytes (otherwise).

      forinline is only passed through to the inherited implementation.

      Returns the replacement node tree, or whatever the inherited simplify
      returns when the pattern does not apply. }
    function tx86addnode.simplify(forinline : boolean) : tnode;
      var
        t, m, ThisNode, ConstNode: TNode;
        lt,rt, ThisType: TNodeType;
        ThisDef: TDef;
        DoOptimisation: Boolean;

        reciprocal, comparison, divisor: AWord;
        shift, N: Byte;
      begin
        { Load into local variables to reduce the number of pointer dereferences }
        rt:=right.nodetype;
        lt:=left.nodetype;

        DoOptimisation:=False;

{$if defined(cpu64bitalu) or defined(cpu32bitalu) or defined(cpu16bitalu)}
        if (cs_opt_level1 in current_settings.optimizerswitches) and
          { The presence of overflow checks tends to cause internal errors with the multiplication nodes }
          not (cs_check_overflow in current_settings.localswitches) and
          (nodetype in [equaln,unequaln]) then
          begin
            { t becomes the mod node, m the constant zero it is compared
              against; both stay nil when the pattern does not match }
            if (lt=modn) and (rt=ordconstn) and (TOrdConstNode(right).value.uvalue=0) then
              begin
                t:=left;
                m:=right;
              end
            else if (rt=modn) and (lt=ordconstn) and (TOrdConstNode(left).value.uvalue=0) then
              begin
                t:=right;
                m:=left;
              end
            else
              begin
                t:=nil;
                m:=nil;
              end;

            if Assigned(t) and (TModDivNode(t).right.nodetype=ordconstn) and
{$ifndef cpu64bitalu}
              { Converting Int64 and QWord division doesn't work under i386 }
{$ifndef cpu32bitalu}
              (TModDivNode(t).resultdef.size < 4) and
{$else cpu32bitalu}
              (TModDivNode(t).resultdef.size < 8) and
{$endif cpu32bitalu}
{$endif cpu64bitalu}
              (TOrdConstNode(TModDivNode(t).right).value>=3) then
              begin
                divisor:=TOrdConstNode(TModDivNode(t).right).value.uvalue;

                { Exclude powers of 2, as there are more efficient ways to handle those }
                if PopCnt(divisor)>1 then
                  begin
                    if is_signed(TModDivNode(t).left.resultdef) then
                      begin
                        { See pages 250-251 of Hacker's Delight, Second Edition
                          for an explanation and proof of the algorithm, but
                          essentially, we're doing the following:

                          - Convert the divisor d to the form k.2^b if it isn't
                            already odd (in which case, k = d and b = 0)
                          - Calculate r, the multiplicative inverse of k modulo 2^N
                          - Calculate c = floor(2^(N-1) / k) & -(2^b)
                          - Let q = ((n * r) + c) ror b (mod 2^N)
                          - Repurpose c to equal floor(2c / 2^b) = c shr (b - 1)
                            (some RISC platforms will benefit from doing this over
                            precalculating the modified constant. For x86,
                            it's better with the constant precalculated for
                            32-bit and under, but for 64-bit, use SHR. )
                          - If q is below or equal to c, then (n mod d) = 0
                        }
                        { Strip widening type conversions from the numerator
                          as long as the narrower signed type can still hold
                          the divisor; every successful strip restarts the
                          loop on the new numerator }
                        while True do
                          begin
                            ThisNode:=TModDivNode(t).left;
                            case ThisNode.nodetype of
                              typeconvn:
                                begin
                                  ThisDef:=TTypeConvNode(ThisNode).left.resultdef;
                                  { See if we can simplify things to a smaller ordinal to
                                    reduce code size and increase speed }
                                  if is_signed(ThisDef) and
                                    is_integer(ThisDef) and
                                    { Byte-sized multiplications can cause problems }
                                    (ThisDef.size>=2) and
                                    { Make sure the divisor is in range }
                                    (divisor>=TOrdDef(ThisDef).low) and
                                    (divisor<=TOrdDef(ThisDef).high) then
                                    begin
                                      TOrdConstNode(TModDivNode(t).right).resultdef:=ThisDef;
                                      TOrdConstNode(m).resultdef:=ThisDef;
                                      TModDivNode(t).resultdef:=ThisDef;

                                      { Destroy the typeconv node }
                                      TModDivNode(t).left:=TTypeConvNode(ThisNode).left;
                                      TTypeConvNode(ThisNode).left:=nil;
                                      ThisNode.Free;
                                      Continue;
                                    end;
                                end;
                              ordconstn:
                                begin
                                  { Just simplify into a constant }
                                  Result:=inherited simplify(forinline);
                                  Exit;
                                end;
                              else
                                ;
                            end;

                            DoOptimisation:=True;
                            Break;
                          end;

                        if DoOptimisation then
                          begin
                            ThisDef:=TModDivNode(t).left.resultdef;

                            { "= 0" becomes "<= limit", "<> 0" becomes "> limit" }
                            if nodetype = equaln then
                              ThisType:=lten
                            else
                              ThisType:=gtn;

                            N:=ThisDef.size*8;
                            calc_mul_inverse(N, TOrdConstNode(TModDivNode(t).right).value.uvalue, reciprocal, shift);

                            { Construct the following node tree for odd divisors:
                                lten (for equaln) or gtn (for unequaln)
                                  addn
                                    muln
                                      typeconv signed-to-unsigned
                                        numerator node (TModDivNode(t).left)
                                      reciprocal constant
                                    comparison constant
                                  comparison constant * 2

                              For even divisors, convert them to the form k.2^b, with
                              odd k, then construct the following:
                                lten (for equaln) or gtn (for unequaln)
                                  ror
                                    (b)
                                    addn
                                      muln
                                        typeconv signed-to-unsigned
                                          numerator node (TModDivNode(t).left)
                                        reciprocal constant
                                      comparison constant
                                  comparison constant shr (b - 1)
                            }
                            ThisNode:=ctypeconvnode.create_internal(TModDivNode(t).left, ThisDef);
                            TTypeConvNode(ThisNode).convtype:=tc_int_2_int;
                            ThisDef:=get_unsigned_inttype(ThisDef);
                            ThisNode.resultdef:=ThisDef;

                            { the numerator is now owned by the new typeconv node }
                            TModDivNode(t).left:=nil;

                            ConstNode:=cordconstnode.create(reciprocal, ThisDef, False);
                            ConstNode.resultdef:=ThisDef;

                            ThisNode:=caddnode.create_internal(muln, ThisNode, ConstNode);
                            ThisNode.resultdef:=ThisDef;

{$push}
{$warnings off}
                            if shift>0 then
                              comparison:=((aWord(1) shl ((N-1) and (SizeOf(aWord)*8-1))) div (divisor shr shift)) and -(1 shl shift)
                            else
                              comparison:=(aWord(1) shl ((N-1) and (SizeOf(aWord)*8-1))) div divisor;
{$pop}
                            ConstNode:=cordconstnode.create(comparison, ThisDef, False);
                            ConstNode.resultdef:=ThisDef;

                            ThisNode:=caddnode.create_internal(addn, ThisNode, ConstNode);
                            ThisNode.resultdef:=ThisDef;

                            if shift>0 then
                              begin
                                ConstNode:=cordconstnode.create(shift, u8inttype, False);
                                ConstNode.resultdef:=u8inttype;
                                ThisNode:=cinlinenode.createintern(in_ror_x_y,false,
                                  ccallparanode.create(ConstNode,
                                  ccallparanode.create(ThisNode, nil)));

                                ThisNode.resultdef:=ThisDef;

                                ConstNode:=cordconstnode.create(comparison shr (shift - 1), ThisDef, False);
                              end
                            else
                              ConstNode:=cordconstnode.create(comparison*2, ThisDef, False);

                            ConstNode.resultdef:=ThisDef;

                            Result:=CAddNode.create_internal(ThisType, ThisNode, ConstNode);
                            Result.resultdef:=resultdef;
                            Exit;
                          end;
                      end
                    else
                      begin
                        { For bit length N, convert "(x mod d) = 0" or "(x mod d) <> 0", where
                          d is an odd-numbered integer constant, to "(x * r) <= m", where
                          dr = 1 (mod 2^N) and m = floor(2^N / d).

                          If d is even, convert to the form k.2^b, where k is odd, then
                          convert to "(x * r) ror b <= m", where kr = 1 (mod 2^N) and
                          m = floor(2^N / d) = floor(2^(N-b) / k) }
                        { Same type conversion stripping as in the signed
                          branch, but here only unsigned narrower types are
                          accepted }
                        while True do
                          begin
                            ThisNode:=TModDivNode(t).left;
                            case ThisNode.nodetype of
                              typeconvn:
                                begin
                                  ThisDef:=TTypeConvNode(ThisNode).left.resultdef;
                                  { See if we can simplify things to a smaller ordinal to
                                    reduce code size and increase speed }
                                  if not is_signed(ThisDef) and
                                    is_integer(ThisDef) and
                                    { Byte-sized multiplications can cause problems }
                                    (ThisDef.size>=2) and
                                    { Make sure the divisor is in range }
                                    (divisor>=TOrdDef(ThisDef).low) and
                                    (divisor<=TOrdDef(ThisDef).high) then
                                    begin
                                      TOrdConstNode(TModDivNode(t).right).resultdef:=ThisDef;
                                      TOrdConstNode(m).resultdef:=ThisDef;
                                      TModDivNode(t).resultdef:=ThisDef;

                                      { Destroy the typeconv node }
                                      TModDivNode(t).left:=TTypeConvNode(ThisNode).left;
                                      TTypeConvNode(ThisNode).left:=nil;
                                      ThisNode.Free;
                                      Continue;
                                    end;
                                end;
                              ordconstn:
                                begin
                                  { Just simplify into a constant }
                                  Result:=inherited simplify(forinline);
                                  Exit;
                                end;
                              else
                                ;
                            end;

                            DoOptimisation:=True;
                            Break;
                          end;

                        if DoOptimisation then
                          begin
                            ThisDef:=TModDivNode(t).left.resultdef;

                            { Construct the following node tree for odd divisors:
                                lten (for equaln) or gtn (for unequaln)
                                  muln
                                    numerator node (TModDivNode(t).left)
                                    reciprocal constant
                                  (2^N / divisor)

                              For even divisors, convert them to the form k.2^b, with
                              odd k, then construct the following:
                                lten (for equaln) or gtn (for unequaln)
                                  ror
                                    (b)
                                    muln
                                      numerator node (TModDivNode(t).left)
                                      reciprocal constant
                                  (2^N / divisor)
                            }

                            if nodetype=equaln then
                              ThisType:=lten
                            else
                              ThisType:=gtn;

                            N:=ThisDef.size*8;
                            calc_mul_inverse(N, TOrdConstNode(TModDivNode(t).right).value.uvalue, reciprocal, shift);

                            ConstNode:=cordconstnode.create(reciprocal, ThisDef, False);
                            ConstNode.resultdef:=ThisDef;

                            ThisNode:=caddnode.create_internal(muln, TModDivNode(t).left, ConstNode);
                            ThisNode.resultdef:=ThisDef;

                            { the numerator is now owned by the new muln node }
                            TModDivNode(t).left:=nil;

                            if shift>0 then
                              begin
                                ConstNode:=cordconstnode.create(shift, u8inttype, False);
                                ConstNode.resultdef:=u8inttype;
                                ThisNode:=cinlinenode.createintern(in_ror_x_y,false,
                                  ccallparanode.create(ConstNode,
                                  ccallparanode.create(ThisNode, nil)));

                                ThisNode.resultdef:=ThisDef;

                                comparison:=(aWord(1) shl ((N-shift) and (SizeOf(aWord)*8-1))) div (divisor shr shift);
                              end
                            else
                              begin
{$push}
{$warnings off}
                                { Because 2^N and divisor are relatively prime,
                                  floor(2^N / divisor) = floor((2^N - 1) / divisor) }
                                comparison:=(aWord(not 0) shr (((SizeOf(aWord)*8)-N) and (SizeOf(aWord)*8-1))) div divisor;
{$pop}
                              end;

                            ConstNode:=cordconstnode.create(comparison, ThisDef, False);
                            ConstNode.resultdef:=ThisDef;

                            Result:=CAddNode.create_internal(ThisType, ThisNode, ConstNode);
                            Result.resultdef:=resultdef;
                            Exit;
                          end;
                      end;
                  end;
              end;
          end;
{$ifend defined(cpu64bitalu) or defined(cpu32bitalu) or defined(cpu16bitalu)}

        { pattern did not apply: fall back to the generic simplification }
        Result:=inherited simplify(forinline);
      end;


    { Returns true when fused multiply-add may be used for this node: the
      result type must be handled by the vector fpu and the selected fpu type
      must offer FMA or FMA4. On i8086 the inherited decision is used. }
    function tx86addnode.use_fma : boolean;
      begin
{$ifndef i8086}
        { test if the result stays in an xmm register, fiddling with fpu registers and fma makes no sense }
        Result:=use_vectorfpu(resultdef) and
          ((fpu_capabilities[current_settings.fputype]*[FPUX86_HAS_FMA,FPUX86_HAS_FMA4])<>[]);
{$else i8086}
        Result:=inherited
use_fma;
{$endif i8086}
      end;


    { Compares two single or double precision values with the scalar
      compare instructions (COMISS/COMISD, or VCOMISS/VCOMISD when UseAVX
      is set) and delivers the result in the CPU flags (LOC_FLAGS).

      Operands coming from the x87 stack are spilled to memory first. One
      operand of the compare must be an MM register; the other may be a
      register or a memory reference.

      Raises internalerror(200402222) when left is neither single nor
      double, and internalerror(200402221)/(200402223) for unexpected
      operand locations. }
    procedure tx86addnode.second_cmpfloatvector;
      var
        op : tasmop;
      const
        { indexed by UseAVX: False selects the SSE opcode, True the AVX one }
        ops_single: array[boolean] of tasmop = (A_COMISS,A_VCOMISS);
        ops_double: array[boolean] of tasmop = (A_COMISD,A_VCOMISD);
      begin
        { the opcode is chosen from the type of the left operand only }
        if is_single(left.resultdef) then
          op:=ops_single[UseAVX]
        else if is_double(left.resultdef) then
          op:=ops_double[UseAVX]
        else
          internalerror(200402222);
        pass_left_right;

        { fpu operands are always in reversed order on the stack }
        if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
          toggleflag(nf_swapped);

        location_reset(location,LOC_FLAGS,OS_NO);

        { Direct move fpu->mm register is not possible, so force any fpu operands to
          memory (not to mm registers because one of the memory locations can be used
          directly in compare instruction, yielding shorter code) }
        if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
        if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

        if (right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
          begin
            { right is already in a register: use it as the register operand
              and left as the other operand. The operands are thereby
              exchanged compared to the branch below, which is recorded by
              toggling nf_swapped (presumably evaluated by getfpuresflags
              when the flags are interpreted -- confirm there) }
            case left.location.loc of
              LOC_REFERENCE,LOC_CREFERENCE:
                begin
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,left.location.reference,right.location.register));
                end;
              LOC_MMREGISTER,LOC_CMMREGISTER:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,left.location.register,right.location.register));
              else
                internalerror(200402221);
            end;
            toggleflag(nf_swapped);
          end
        else
          begin
            { otherwise left becomes the register operand; right may stay
              in memory }
            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
            case right.location.loc of
              LOC_REFERENCE,LOC_CREFERENCE:
                begin
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,right.location.reference,left.location.register));
                end;
              LOC_MMREGISTER,LOC_CMMREGISTER:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,right.location.register,left.location.register));
              else
                internalerror(200402223);
            end;
          end;
        location.resflags:=getfpuresflags;
        location_freetemp(current_asmdata.CurrAsmList,left.location);
        location_freetemp(current_asmdata.CurrAsmList,right.location);
      end;


    { Generates code for add/sub/mul/div on vector operands that fit into an
      MM register; the result is delivered in an MM register.

      The element size handed to the code generator is always derived from
      the element type of the vector (tarraydef(left.resultdef).elementdef).
      left.resultdef itself is an array definition here, so it must not be
      cast to tfloatdef directly: that is an unchecked cast and does not
      describe the element type.

      With UseAVX a fresh destination register is allocated and the three
      operand call is used; otherwise one of the source registers is reused
      as destination.

      Raises internalerror(200610071) for unsupported node types and
      internalerror(200610072) when the operand type does not fit into an
      MM register. }
    procedure tx86addnode.second_opvector;
      var
        op : topcg;
      begin
        pass_left_right;
        if (nf_swapped in flags) then
          swapleftright;

        { translate the node type into the generic code generator operation }
        case nodetype of
          addn :
            op:=OP_ADD;
          muln :
            op:=OP_MUL;
          subn :
            op:=OP_SUB;
          slashn :
            op:=OP_DIV;
          else
            internalerror(200610071);
        end;

        if fits_in_mm_register(left.resultdef) then
          begin
            location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
            { we can use only right as left operand if the operation is commutative }
            if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
              begin
                if UseAVX then
                  begin
                    location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                    cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,
                      tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype],left.location,right.location.register,location.register,nil);
                  end
                else
                  begin
                    { two operand form: right doubles as destination }
                    location.register:=right.location.register;
                    cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,
                      tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype],left.location,location.register,nil);
                  end;
              end
            else
              begin
                location_force_mmreg(current_asmdata.CurrAsmList,left.location,false);
                if UseAVX then
                  begin
                    location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                    cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,
                      tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype],right.location,left.location.register,location.register,nil);
                  end
                else
                  begin
                    { two operand form: left doubles as destination }
                    location.register:=left.location.register;
                    cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,
tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype],right.location,location.register,nil);
                  end;
              end;
          end
        else
          begin
            { not yet supported }
            internalerror(200610072);
          end
      end;


    { Generates code for floating point add/sub/mul/div nodes.

      When the result type is handled by the vector fpu, the work is
      delegated to second_addfloatavx (UseAVX) or second_addfloatsse.
      Otherwise x87 code is emitted and the result is left in ST.

      Raises internalerror(2003042203) for unsupported node types. }
    procedure tx86addnode.second_addfloat;
      const
        { indexed by hasref: False selects the popping register form that
          works on ST/ST(1), True the form taking a memory operand }
        ops_add:  array[boolean] of TAsmOp = (A_FADDP,A_FADD);
        ops_mul:  array[boolean] of TAsmOp = (A_FMULP,A_FMUL);
        ops_sub:  array[boolean] of TAsmOp = (A_FSUBP,A_FSUB);
        ops_rsub: array[boolean] of TAsmOp = (A_FSUBRP,A_FSUBR);
        ops_div:  array[boolean] of TAsmOp = (A_FDIVP,A_FDIV);
        ops_rdiv: array[boolean] of TAsmOp = (A_FDIVRP,A_FDIVR);
      var
        op : TAsmOp;
        refnode, hp: tnode;
        hasref : boolean;
      begin
        if use_vectorfpu(resultdef) then
          begin
            if UseAVX then
              second_addfloatavx
            else
              second_addfloatsse;
            exit;
          end;

        { can the operation do the conversion? }
        { a type conversion sitting on top of a single or double operand is
          removed so that the operand itself is evaluated; the conversion
          node is freed after its child has been detached }
        if (left.nodetype=typeconvn) and (is_double(ttypeconvnode(left).left.resultdef) or is_single(ttypeconvnode(left).left.resultdef)) then
          begin
            hp:=left;
            left:=ttypeconvnode(left).left;
            ttypeconvnode(hp).left:=nil;
            hp.Free;
          end;
        if (right.nodetype=typeconvn) and (is_double(ttypeconvnode(right).left.resultdef) or is_single(ttypeconvnode(right).left.resultdef)) then
          begin
            hp:=right;
            right:=ttypeconvnode(right).left;
            ttypeconvnode(hp).left:=nil;
            hp.Free;
          end;

        pass_left_right;
        { refnode is assigned when one operand can be used as memory operand }
        prepare_x87_locations(refnode);
        hasref:=assigned(refnode);

        { sub and div are not commutative: the reversed opcode is used when
          the operands have been swapped }
        case nodetype of
          addn :
            op:=ops_add[hasref];
          muln :
            op:=ops_mul[hasref];
          subn :
            if (nf_swapped in flags) then
              op:=ops_rsub[hasref]
            else
              op:=ops_sub[hasref];
          slashn :
            if (nf_swapped in flags) then
              op:=ops_rdiv[hasref]
            else
              op:=ops_div[hasref];
          else
            internalerror(2003042203);
        end;

        if hasref then
          emit_ref(op,tcgsize2opsize[refnode.location.size],refnode.location.reference)
        else
          begin
            emit_reg_reg(op,S_NO,NR_ST,NR_ST1);
            { the popping form was used, keep the fpu stack bookkeeping in sync }
            tcgx86(cg).dec_fpu_stack;
          end;

        location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
        location.register:=NR_ST;
      end;


    procedure tx86addnode.second_cmpfloat;
{$ifdef i8086}
      var
        tmpref: treference;
{$endif i8086}
      begin
        if use_vectorfpu(left.resultdef) or
use_vectorfpu(right.resultdef) then begin second_cmpfloatvector; exit; end; pass_left_right; force_left_and_right_fpureg; {$ifndef x86_64} if current_settings.cputype LOC_CONSTANT) and (right.location.loc <> LOC_CONSTANT) ) ) then begin { ANDN only supports the second operand being inverted; however, since we're dealing with ordinals, there won't be any Boolean shortcutting, so we can safely swap the parameters } if (right.nodetype <> notn) then swapleftright; secondpass(left); { Skip the not node completely } Include(right.transientflags, tnf_do_not_execute); secondpass(tnotnode(right).left); { allocate registers } hlcg.location_force_reg( current_asmdata.CurrAsmList, tnotnode(right).left.location, tnotnode(right).left.resultdef, tnotnode(right).left.resultdef, false ); if left.location.loc = LOC_CONSTANT then { With "const and (not variable)", we can probably still make a saving when it comes to pipeline stalls (left.location.loc will become LOC_CREGISTER). } hlcg.location_force_reg( current_asmdata.CurrAsmList, left.location, left.resultdef, left.resultdef, true ); set_result_location_reg; case left.location.loc of LOC_REFERENCE, LOC_CREFERENCE: emit_ref_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.reference, tnotnode(right).left.location.register, location.register); LOC_REGISTER, LOC_CREGISTER: emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register); else InternalError(2022102110); end; { Overflow can't happen with and/andn } Exit; end; end; { BMI2 optimisations } if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) then begin { Can we turn "x and ((1 shl y) - 1)" into a BZHI instruction instead? 
} if (nodetype = andn) and (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and ( ( (right.nodetype = subn) and (taddnode(right).right.nodetype = ordconstn) and (tordconstnode(taddnode(right).right).value = 1) and (taddnode(right).left.nodetype = shln) and (tshlshrnode(taddnode(right).left).left.nodetype = ordconstn) and (tordconstnode(tshlshrnode(taddnode(right).left).left).value = 1) ) or ( (left.nodetype = subn) and (taddnode(left).right.nodetype = ordconstn) and (tordconstnode(taddnode(left).right).value = 1) and (taddnode(left).left.nodetype = shln) and (tshlshrnode(taddnode(left).left).left.nodetype = ordconstn) and (tordconstnode(tshlshrnode(taddnode(left).left).left).value = 1) ) ) then begin { Put the subtract node on the right } if (right.nodetype <> subn) then swapleftright; secondpass(left); { Skip the subtract and shift nodes completely } Include(right.transientflags, tnf_do_not_execute); Include(taddnode(right).left.transientflags, tnf_do_not_execute); { Helps avoid all the awkward typecasts } indexnode := tshlshrnode(taddnode(right).left).right; {$ifdef x86_64} { The code generator sometimes extends the shift result to 64-bit unnecessarily } if (indexnode.nodetype = typeconvn) and (opsize in [OS_32, OS_S32]) and (def_cgsize(TTypeConvNode(indexnode).resultdef) in [OS_64, OS_S64]) then begin { Convert to the 32-bit type } indexnode.resultdef := resultdef; node_reset_flags(indexnode,[],[tnf_pass1_done]); { We should't be getting any new errors } if do_firstpass(indexnode) then InternalError(2022110201); { Keep things internally consistent in case indexnode changed } tshlshrnode(taddnode(right).left).right := indexnode; end; {$endif x86_64} secondpass(indexnode); { allocate registers } hlcg.location_force_reg( current_asmdata.CurrAsmList, indexnode.location, indexnode.resultdef, resultdef, false ); set_result_location_reg; case left.location.loc of LOC_REFERENCE, LOC_CREFERENCE: emit_reg_ref_reg(A_BZHI, TCGSize2OpSize[opsize], 
indexnode.location.register, left.location.reference, location.register); LOC_REGISTER, LOC_CREGISTER: emit_reg_reg_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, left.location.register, location.register); else InternalError(2022102111); end; Exit; end; end; end; {$endif not i8086} pass_left_right; { do we have to allocate a register? If yes, then three opcode instructions are better, however for sub three op code instructions make no sense if right is a reference } if ((left.location.loc<>LOC_REGISTER) and (right.location.loc<>LOC_REGISTER) and ((nodetype<>subn) or not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE])) and { 3 op mul makes only sense if a constant is involed } ((nodetype<>muln) or (left.location.loc=LOC_CONSTANT) or (right.location.loc=LOC_CONSTANT) {$ifndef i8086} or ((CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) and (not(needoverflowcheck)) ) {$endif i8086} ) and (not(nodetype in [orn,andn,xorn]))) or ((nodetype=addn) and (left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT]) and (right.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT])) then begin { allocate registers } force_reg_left_right(nodetype<>subn,true); set_result_location_reg; if nodetype<>subn then begin if checkoverflow then cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS); if (right.location.loc<>LOC_CONSTANT) then hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef, left.location.register,right.location.register, location.register,checkoverflow,ovloc) else hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef, right.location.value,left.location.register, location.register,checkoverflow,ovloc); end else { subtract is a special case since its not commutative } begin if (nf_swapped in flags) then swapleftright; if left.location.loc<>LOC_CONSTANT then begin if checkoverflow then cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS); if 
right.location.loc<>LOC_CONSTANT then hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef, right.location.register,left.location.register, location.register,checkoverflow,ovloc) else hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef, right.location.value,left.location.register, location.register,checkoverflow,ovloc); end else begin tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef); hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef, left.location.value,tmpreg); if checkoverflow then cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS); hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef, right.location.register,tmpreg,location.register,checkoverflow,ovloc); end; end end else begin { at least one location should be a register, if yes, try to re-use it, so we can try two operand opcodes } if left.location.loc<>LOC_REGISTER then begin if right.location.loc<>LOC_REGISTER then hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false) else begin location_swap(left.location,right.location); toggleflag(nf_swapped); end; end; { at this point, left.location.loc should be LOC_REGISTER } if right.location.loc=LOC_REGISTER then begin if checkoverflow then cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS); { when swapped another result register } if (nodetype=subn) and (nf_swapped in flags) then begin cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize, left.location.register,right.location.register); location_swap(left.location,right.location); toggleflag(nf_swapped); end else cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize, right.location.register,left.location.register); end else begin { right.location<>LOC_REGISTER } if right.location.loc in [LOC_CSUBSETREF,LOC_CSUBSETREG,LOC_SUBSETREF,LOC_SUBSETREG] then 
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,left.resultdef,true);
                if (nodetype=subn) and (nf_swapped in flags) then
                  begin
                    tmpreg:=left.location.register;
                    left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                    cg.a_load_loc_reg(current_asmdata.CurrAsmList,opsize,right.location,left.location.register);
                    if checkoverflow then
                      cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                    cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,tmpreg,left.location.register);
                  end
                else
                  begin
                    if checkoverflow then
                      cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                    cg.a_op_loc_reg(current_asmdata.CurrAsmList,cgop,opsize,right.location,left.location.register);
                  end;
                location_freetemp(current_asmdata.CurrAsmList,right.location);
              end;

            location_copy(location,left.location);
          end;

        { emit overflow check if required }
        if checkoverflow then
          cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,Location,resultdef,ovloc);
      end;


    { Generates code for boolean and/or/xor nodes.

      Short circuit "and"/"or" (full boolean evaluation switched off, or the
      node explicitly flagged with anf_short_bool) and all 64 bit boolean
      operands are handled by the inherited implementation; everything else
      is treated like an ordinal operation. }
    procedure tx86addnode.second_addboolean;
      begin
        if (nodetype in [orn,andn]) and
           (not(cs_full_boolean_eval in current_settings.localswitches) or
            (anf_short_bool in addnodeflags)) then
          inherited second_addboolean
        else if is_64bit(left.resultdef) then
          inherited
        else
          second_addordinal;
      end;


    { Generates code for comparisons of ordinal values; the result is
      delivered in the CPU flags (LOC_FLAGS) with the condition chosen by
      getresflags.

      The comparison is treated as unsigned as soon as one of the two
      operand types is unsigned. Operand type and size are taken from the
      left operand. }
    procedure tx86addnode.second_cmpordinal;
      var
         opdef  : tdef;
         opsize : tcgsize;
         unsigned : boolean;
      begin
         unsigned:=not(is_signed(left.resultdef)) or
                   not(is_signed(right.resultdef));
         opdef:=left.resultdef;
         opsize:=def_cgsize(opdef);

         pass_left_right;

         { constant compared against memory: emit "cmp const,mem" directly
           without loading left into a register first. On x86_64 this is
           restricted for 64 bit operands to constants within the longint
           range (presumably because the instruction only takes a 32 bit
           immediate -- confirm against the instruction reference) }
         if (right.location.loc=LOC_CONSTANT) and
            (left.location.loc in [LOC_REFERENCE, LOC_CREFERENCE])
{$ifdef x86_64}
              and ((not (opsize in [OS_64,OS_S64])) or (
                (right.location.value>=low(longint)) and (right.location.value<=high(longint))
              ))
{$endif x86_64}
         then
           begin
             cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
             emit_const_ref(A_CMP, TCGSize2Opsize[opsize], right.location.value, left.location.reference);
             location_freetemp(current_asmdata.CurrAsmList,left.location);
           end
         else
           begin
             { generic path: left must be in a register, then the shared
               two operand emitter produces the CMP }
             left_must_be_reg(opdef,opsize,false);
             emit_generic_code(A_CMP,opsize,unsigned,false,false);
             location_freetemp(current_asmdata.CurrAsmList,right.location);
             location_freetemp(current_asmdata.CurrAsmList,left.location);
           end;
         location_reset(location,LOC_FLAGS,OS_NO);
         location.resflags:=getresflags(unsigned);
      end;

{ unit initialization: install tx86addnode as the add node class (caddnode) }
begin
   caddnode:=tx86addnode;
end.