* (modified) patch by Gareth Moreton: Speed improvement in case blocks, resolves #0034762

This patch improves the compiler where "case" statements are concerned, using jump tables more often and creating more efficient machine code in some situations:
  * If a case block only contains one branch (not including the else block), the initial range check is removed, since this becomes wasted effort.
  * If the else block is empty, the else label is set to the end label - though this doesn't decrease the code size, it takes a bit of strain off the peephole optimizer.
  * On -O2 and above, some node analysis is now done on the branches. Most of the time this just redirects the branch label to the end
    label for empty blocks, but if the block consists solely of a goto statement, the branch label is redirected to the goto's destination instead,
    thus increasing performance by avoiding chained jumps (the peephole optimiser cannot collapse these when the label addresses are in a jump table).
  * Some checks now use what I call the 'true count' rather than the 'label count'. The true count includes each
    individual value in a range - for example, 0..2 counts as 3. This increases the chance that a jump table will be
    utilised in situations where it is more efficient than a linear list.
  * For jump tables, if the case block almost covers the entire range (32 entries or fewer from full coverage),
    the initial range check is removed and the gaps are included in the jump table (pointing to the else label).

git-svn-id: trunk@40676 -
This commit is contained in:
florian 2018-12-27 18:31:55 +00:00
parent 75ab803528
commit 7f5c2fa3aa
6 changed files with 2349 additions and 63 deletions

1
.gitattributes vendored
View File

@ -10718,6 +10718,7 @@ tests/Makefile.fpc svneol=native#text/plain
tests/bench/bansi1.inc svneol=native#text/plain
tests/bench/bansi1.pp svneol=native#text/plain
tests/bench/bansi1mt.pp svneol=native#text/plain
tests/bench/bcase.pp -text svneol=native#text/pascal
tests/bench/blists1.inc svneol=native#text/plain
tests/bench/blists1.pp svneol=native#text/plain
tests/bench/bmd5.pp svneol=native#text/plain

View File

@ -73,6 +73,13 @@ interface
jumptable_no_range : boolean;
{ has the implementation jumptable support }
min_label : tconstexprint;
{ Number of labels }
labelcnt: TCgInt;
{ Number of individual values checked, counting each value in a range
individually (e.g. 0..2 counts as 3). }
TrueCount: TCgInt;
function GetBranchLabel(Block: TNode; out _Label: TAsmLabel): Boolean;
function blocklabel(id:longint):tasmlabel;
procedure optimizevalues(var max_linear_list:aint;var max_dist:aword);virtual;
@ -90,9 +97,10 @@ implementation
uses
verbose,
symconst,symdef,defutil,
cutils,
symconst,symdef,symsym,defutil,
pass_2,tgobj,
ncon,
nbas,ncon,ncgflw,
ncgutil,hlcgobj;
@ -524,6 +532,79 @@ implementation
TCGCASENODE
*****************************************************************************}
{ Determines the assembler label through which a branch of the case statement
  (a case block or the else block) is entered.

    Block  - root node of the branch's statements; may be nil
    _Label - receives the label that jumps to this branch should target

  Returns True if _Label was redirected to a label that exists independently
  of the branch (endlabel, or the assembler label of a goto's target), and
  False if a fresh jump label was allocated for the branch. }
function tcgcasenode.GetBranchLabel(Block: TNode; out _Label: TAsmLabel): Boolean;
  var
    Node: TNode;
    TargetSym: TLabelSym;
  begin
    { A branch without any node is entered at the end label }
    if not Assigned(Block) then
      begin
        _Label := endlabel;
        Result := True;
        Exit;
      end;

    Result := False;

    { Redirecting labels isn't particularly debugger friendly, so the node
      analysis is only performed when level 2 optimisations are enabled }
    if cs_opt_level2 in current_settings.optimizerswitches then
      begin
        Node := Block;
        while Assigned(Node) do
          begin
            case Node.nodetype of
              nothingn:
                begin
                  { Nothing to execute: go straight to the end of the case }
                  _Label := endlabel;
                  Result := True;
                  Exit;
                end;
              goton:
                begin
                  TargetSym := TCGGotoNode(Node).labelsym;
                  if not Assigned(TargetSym) then
                    InternalError(2018121131);

                  _Label := TCGLabelNode(TCGGotoNode(Node).labelnode).getasmlabel;
                  { Without an assembler label there is nothing to redirect
                    to, so give the branch its own label below }
                  if not Assigned(_Label) then
                    Break;

                  { Keep tabs on the fact that an actual 'goto' was used }
                  Include(flowcontrol,fc_gotolabel);
                  Result := True;
                  Exit;
                end;
              blockn:
                { Look inside the block }
                Node := TBlockNode(Node).Left;
              statementn:
                { A statement followed by another one is a compound block
                  that can't be simplified; a lone statement is unwrapped }
                if Assigned(TStatementNode(Node).right) then
                  Break
                else
                  Node := TStatementNode(Node).Left;
              else
                { Any other node needs real code }
                Break;
            end;
          end;
      end;

    { No redirection was possible: create a unique label for the branch }
    current_asmdata.getjumplabel(_Label);
  end;
function tcgcasenode.blocklabel(id:longint):tasmlabel;
begin
if not assigned(blocks[id]) then
@ -642,7 +723,7 @@ implementation
opsize:=newdef;
end;
last:=0;
first:=true;
first:=(labelcnt > 1); { Can greatly simplify the range checks if there's only one label }
scratch_reg:=hlcg.getintregister(current_asmdata.CurrAsmList,opsize);
genitem(hp);
hlcg.a_jmp_always(current_asmdata.CurrAsmList,elselabel);
@ -1043,25 +1124,43 @@ implementation
end;
procedure tcgcasenode.pass_generate_code;
{ Combines "case_count_labels" and "case_true_count": walks the label tree
  rooted at p once, adding one to labelcnt for every node and the number of
  values covered by the node (_high - _low + 1) to TrueCount.
  p must not be nil. }
procedure CountBoth(p : pcaselabel);
  begin
    repeat
      Inc(labelcnt);
      Inc(TrueCount, (p^._high.svalue - p^._low.svalue) + 1);

      { The "less" subtree is handled recursively... }
      if assigned(p^.less) then
        CountBoth(p^.less);

      { ...while the "greater" chain is followed iteratively }
      p := p^.greater;
    until not assigned(p);
  end;
var
oldflowcontrol: tflowcontrol;
i : longint;
dist,distv,
dist : aword;
distv,
lv,hv,
max_label: tconstexprint;
labelcnt : tcgint;
max_linear_list : aint;
max_dist : aword;
ShortcutElse: Boolean;
begin
location_reset(location,LOC_VOID,OS_NO);
oldflowcontrol := flowcontrol;
include(flowcontrol,fc_inflowcontrol);
{ Allocate labels }
current_asmdata.getjumplabel(endlabel);
current_asmdata.getjumplabel(elselabel);
{ Do some optimisation to deal with empty else blocks }
ShortcutElse := GetBranchLabel(elseblock, elselabel);
for i:=0 to blocks.count-1 do
current_asmdata.getjumplabel(pcaseblock(blocks[i])^.blocklabel);
with pcaseblock(blocks[i])^ do
shortcut := GetBranchLabel(statement, blocklabel);
with_sign:=is_signed(left.resultdef);
if with_sign then
@ -1118,8 +1217,13 @@ implementation
{ moreover can the size only be appro- }
{ ximated as it is not known if rel8, }
{ rel16 or rel32 jumps are used }
max_label:=case_get_max(labels);
labelcnt:=case_count_labels(labels);
labelcnt := 0;
TrueCount := 0;
CountBoth(labels);
max_label := case_get_max(labels);
{ can we omit the range check of the jump table ? }
getrange(left.resultdef,lv,hv);
jumptable_no_range:=(lv=min_label) and (hv=max_label);
@ -1128,7 +1232,7 @@ implementation
if distv>=0 then
dist:=distv.uvalue
else
dist:=-distv.svalue;
dist:=aword(-distv.svalue);
{ optimize for size ? }
if cs_opt_size in current_settings.optimizerswitches then
@ -1137,8 +1241,8 @@ implementation
(min_label>=int64(low(aint))) and
(max_label<=high(aint)) and
not((labelcnt<=2) or
((max_label-min_label)<0) or
((max_label-min_label)>3*labelcnt)) then
(distv.svalue<0) or
(dist>3*TrueCount)) then
begin
{ if the labels less or more a continuum then }
genjumptable(labels,min_label.svalue,max_label.svalue);
@ -1151,7 +1255,12 @@ implementation
end
else
begin
max_dist:=4*labelcnt;
max_dist:=4*TrueCount;
{ Don't allow jump tables to get too large }
if max_dist>4*labelcnt then
max_dist:=min(max_dist,2048);
if jumptable_no_range then
max_linear_list:=4
else
@ -1187,26 +1296,37 @@ implementation
end;
{ generate the instruction blocks }
for i:=0 to blocks.count-1 do
for i:=0 to blocks.count-1 do with pcaseblock(blocks[i])^ do
begin
current_asmdata.CurrAsmList.concat(cai_align.create(current_settings.alignment.jumpalign));
cg.a_label(current_asmdata.CurrAsmList,pcaseblock(blocks[i])^.blocklabel);
secondpass(pcaseblock(blocks[i])^.statement);
{ don't come back to case line }
current_filepos:=current_asmdata.CurrAsmList.getlasttaifilepos^;
{ If the block label has been shortcut to point elsewhere (see the "shortcut" field),
there's no need to emit the block's own label and code }
if not shortcut then
begin
current_asmdata.CurrAsmList.concat(cai_align.create(current_settings.alignment.jumpalign));
cg.a_label(current_asmdata.CurrAsmList,blocklabel);
secondpass(statement);
{ don't come back to case line }
current_filepos:=current_asmdata.CurrAsmList.getlasttaifilepos^;
{$ifdef OLDREGVARS}
load_all_regvars(current_asmdata.CurrAsmList);
load_all_regvars(current_asmdata.CurrAsmList);
{$endif OLDREGVARS}
hlcg.a_jmp_always(current_asmdata.CurrAsmList,endlabel);
hlcg.a_jmp_always(current_asmdata.CurrAsmList,endlabel);
end;
end;
current_asmdata.CurrAsmList.concat(cai_align.create(current_settings.alignment.jumpalign));
{ ...and the else block }
hlcg.a_label(current_asmdata.CurrAsmList,elselabel);
if assigned(elseblock) then
if not ShortcutElse then
begin
secondpass(elseblock);
current_asmdata.CurrAsmList.concat(cai_align.create(current_settings.alignment.jumpalign));
hlcg.a_label(current_asmdata.CurrAsmList,elselabel);
end;
if Assigned(elseblock) then
begin
secondpass(elseblock);
{$ifdef OLDREGVARS}
load_all_regvars(current_asmdata.CurrAsmList);
load_all_regvars(current_asmdata.CurrAsmList);
{$endif OLDREGVARS}
end;

View File

@ -62,6 +62,12 @@ interface
{ label (only used in pass_generate_code) }
blocklabel : tasmlabel;
{ shortcut - set to true if blocklabel isn't actually unique to the
case block due to one of the following conditions:
- if the node contains a jump, then the label is set to that jump's destination,
- if the node is empty, the label is set to the end label. }
shortcut: Boolean;
statementlabel : tlabelnode;
{ instructions }
statement : tnode;
@ -121,6 +127,9 @@ interface
{ counts the labels }
function case_count_labels(root : pcaselabel) : longint;
{ Returns the true count in a case block, which includes each individual
value in a range (e.g. "0..2" counts as 3) }
function case_true_count(root : pcaselabel) : longint;
{ searches the highest label }
function case_get_max(root : pcaselabel) : tconstexprint;
{ searches the lowest label }
@ -439,6 +448,29 @@ implementation
end;
{ Returns the true count of the case label tree starting at root: every
  individual value covered by a label is counted, so a range such as "0..2"
  contributes 3. root must not be nil. }
function case_true_count(root : pcaselabel) : longint;
  var
    total : longint;

  { Adds the number of values covered by node and all of its descendants
    to total }
  procedure accumulate(node : pcaselabel);
    begin
      repeat
        inc(total, (node^._high.svalue - node^._low.svalue) + 1);

        { recurse into the "less" subtree, iterate along "greater" }
        if assigned(node^.less) then
          accumulate(node^.less);

        node:=node^.greater;
      until not assigned(node);
    end;

  begin
    total:=0;
    accumulate(root);
    case_true_count:=total;
  end;
function case_get_max(root : pcaselabel) : tconstexprint;
var
hp : pcaselabel;

View File

@ -47,7 +47,7 @@ implementation
uses
systems,
verbose,globals,
symconst,symdef,defutil,
symconst,symdef,defutil,cutils,
aasmbase,aasmtai,aasmdata,aasmcpu,
cgbase,pass_2,tgobj,
ncon,
@ -76,6 +76,12 @@ implementation
opcgsize: tcgsize;
jumpreg: tregister;
labeltyp: taiconst_type;
AlmostExhaustive: Boolean;
lv, hv: TConstExprInt;
ExhaustiveLimit, Range, x, oldmin : aint;
const
ExhaustiveLimitBase = 32;
procedure genitem(list:TAsmList;t : pcaselabel);
var
@ -83,6 +89,7 @@ implementation
begin
if assigned(t^.less) then
genitem(list,t^.less);
{ fill possible hole }
i:=last.svalue+1;
while i<=t^._low.svalue-1 do
@ -102,20 +109,51 @@ implementation
end;
begin
lv:=0;
hv:=0;
oldmin:=0;
last:=min_;
{ This generates near pointers on i8086 }
labeltyp:=aitconst_ptr;
opcgsize:=def_cgsize(opsize);
AlmostExhaustive := False;
if not(jumptable_no_range) then
begin
{ a <= x <= b <-> unsigned(x-a) <= (b-a) }
cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SUB,opcgsize,aint(min_),hregister);
{ case expr greater than max_ => goto elselabel }
cg.a_cmp_const_reg_label(current_asmdata.CurrAsmList,opcgsize,OC_A,aint(max_)-aint(min_),hregister,elselabel);
min_:=0;
{ do not sign extend when we load the index register, as we applied an offset above }
opcgsize:=tcgsize2unsigned[opcgsize];
getrange(left.resultdef,lv,hv);
Range := aint(max_)-aint(min_);
if (cs_opt_size in current_settings.optimizerswitches) then
{ Limit size of jump tables for small enumerations so they have
to be at least two-thirds full before being considered for the
"almost exhaustive" treatment }
ExhaustiveLimit := min(ExhaustiveLimitBase, TrueCount shl 1)
else
ExhaustiveLimit := ExhaustiveLimitBase;
{ If true, then this indicates that almost every possible value of x is covered by
a label. As such, it's more cost-efficient to remove the initial range check and
instead insert the remaining values into the jump table, pointing at elselabel. [Kit] }
if ((hv - lv) - Range <= ExhaustiveLimit) then
begin
oldmin := min_;
min_ := lv.svalue;
AlmostExhaustive := True;
end
else
begin
{ a <= x <= b <-> unsigned(x-a) <= (b-a) }
cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SUB,opcgsize,aint(min_),hregister);
{ case expr greater than max_ => goto elselabel }
cg.a_cmp_const_reg_label(current_asmdata.CurrAsmList,opcgsize,OC_A,aint(max_)-aint(min_),hregister,elselabel);
min_:=0;
{ do not sign extend when we load the index register, as we applied an offset above }
opcgsize:=tcgsize2unsigned[opcgsize];
end;
end;
current_asmdata.getglobaldatalabel(table);
{ make it a 32bit register }
indexreg:=cg.makeregsize(current_asmdata.CurrAsmList,hregister,OS_INT);
@ -148,7 +186,31 @@ implementation
jtlist:=current_procinfo.aktlocaldata;
new_section(jtlist,sec_rodata,current_procinfo.procdef.mangledname,sizeof(aint));
jtlist.concat(Tai_label.Create(table));
genitem(jtlist,hp);
if AlmostExhaustive then
begin
{ Fill the table with the values below _min }
x := lv.svalue;
while x < oldmin do
begin
jtlist.concat(Tai_const.Create_type_sym(labeltyp, elselabel));
Inc(x);
end;
genitem(jtlist,hp);
{ Fill the table with the values above _max }
{ Subtracting one from hv and not adding 1 to max averts the risk of an overflow }
x := max_;
hv := hv - 1;
while x <= hv.svalue do
begin
jtlist.concat(Tai_const.Create_type_sym(labeltyp, elselabel));
Inc(x);
end;
end
else
genitem(jtlist,hp)
end;
@ -251,7 +313,7 @@ implementation
begin
last:=0;
lastrange:=false;
first:=true;
first:=(labelcnt > 1); { Can greatly simplify the range checks if there's only one label }
genitem(hp);
cg.a_jmp_always(current_asmdata.CurrAsmList,elselabel);
end;

View File

@ -26,6 +26,7 @@ unit nx64set;
interface
uses
constexp,
globtype,
nset,nx86set;
@ -39,13 +40,13 @@ interface
implementation
uses
systems,
verbose,globals,constexp,
defutil,
aasmbase,aasmtai,aasmdata,
systems,cpuinfo,
verbose,globals,
defutil,cutils,
aasmbase,aasmtai,aasmdata,aasmcpu,
cgbase,
cpubase,procinfo,
cga,cgutils,cgobj;
cga,cgutils,cgobj,cgx86;
{*****************************************************************************
@ -66,73 +67,111 @@ implementation
tablelabel: TAsmLabel;
basereg,indexreg,jumpreg: TRegister;
href: TReference;
jtlist: TAsmList;
opcgsize: tcgsize;
sectype: TAsmSectiontype;
jtitemconsttype: taiconst_type;
AlmostExhaustive: Boolean;
lv, hv: TConstExprInt;
ExhaustiveLimit, Range, x, oldmin : aint;
procedure genitem(list:TAsmList;t : pcaselabel);
const
ExhaustiveLimitBase = 32;
procedure genitem(t : pcaselabel);
var
i : aint;
begin
if assigned(t^.less) then
genitem(list,t^.less);
genitem(t^.less);
{ fill possible hole }
i:=last.svalue+1;
while i<=t^._low.svalue-1 do
begin
list.concat(Tai_const.Create_rel_sym(jtitemconsttype,tablelabel,elselabel));
jtlist.concat(Tai_const.Create_rel_sym(jtitemconsttype,tablelabel,elselabel));
inc(i);
end;
i:=t^._low.svalue;
while i<=t^._high.svalue do
begin
list.concat(Tai_const.Create_rel_sym(jtitemconsttype,tablelabel,blocklabel(t^.blockid)));
jtlist.concat(Tai_const.Create_rel_sym(jtitemconsttype,tablelabel,blocklabel(t^.blockid)));
inc(i);
end;
last:=t^._high;
if assigned(t^.greater) then
genitem(list,t^.greater);
genitem(t^.greater);
end;
begin
lv:=0;
hv:=0;
if not(target_info.system in systems_darwin) then
jtitemconsttype:=aitconst_32bit
else
{ see https://gmplib.org/list-archives/gmp-bugs/2012-December/002836.html }
jtitemconsttype:=aitconst_darwin_dwarf_delta32;
jtlist := current_asmdata.CurrAsmList;
last:=min_;
opcgsize:=def_cgsize(opsize);
AlmostExhaustive := False;
oldmin := min_;
if not(jumptable_no_range) then
begin
{ a <= x <= b <-> unsigned(x-a) <= (b-a) }
cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SUB,opcgsize,aint(min_),hregister);
{ case expr greater than max_ => goto elselabel }
cg.a_cmp_const_reg_label(current_asmdata.CurrAsmList,opcgsize,OC_A,aint(max_)-aint(min_),hregister,elselabel);
min_:=0;
{ do not sign extend when we load the index register, as we applied an offset above }
opcgsize:=tcgsize2unsigned[opcgsize];
getrange(left.resultdef,lv,hv);
Range := aint(max_)-aint(min_);
if (cs_opt_size in current_settings.optimizerswitches) then
{ Limit size of jump tables for small enumerations so they have
to be at least two-thirds full before being considered for the
"almost exhaustive" treatment }
ExhaustiveLimit := min(ExhaustiveLimitBase, TrueCount shl 1)
else
ExhaustiveLimit := ExhaustiveLimitBase;
{ If true, then this indicates that almost every possible value of x is covered by
a label. As such, it's more cost-efficient to remove the initial range check and
instead insert the remaining values into the jump table, pointing at elselabel. [Kit] }
if ((hv - lv) - Range <= ExhaustiveLimit) then
begin
oldmin := min_;
min_ := lv.svalue;
AlmostExhaustive := True;
end
else
begin
{ a <= x <= b <-> unsigned(x-a) <= (b-a) }
cg.a_op_const_reg(jtlist,OP_SUB,opcgsize,aint(min_),hregister);
{ case expr greater than max_ => goto elselabel }
cg.a_cmp_const_reg_label(jtlist,opcgsize,OC_A,Range,hregister,elselabel);
min_:=0;
{ do not sign extend when we load the index register, as we applied an offset above }
opcgsize:=tcgsize2unsigned[opcgsize];
end;
end;
{ local label in order to avoid using GOT }
current_asmdata.getlabel(tablelabel,alt_data);
indexreg:=cg.makeregsize(current_asmdata.CurrAsmList,hregister,OS_ADDR);
cg.a_load_reg_reg(current_asmdata.CurrAsmList,opcgsize,OS_ADDR,hregister,indexreg);
indexreg:=cg.makeregsize(jtlist,hregister,OS_ADDR);
cg.a_load_reg_reg(jtlist,opcgsize,OS_ADDR,hregister,indexreg);
{ load table address }
reference_reset_symbol(href,tablelabel,0,4,[]);
basereg:=cg.getaddressregister(current_asmdata.CurrAsmList);
cg.a_loadaddr_ref_reg(current_asmdata.CurrAsmList,href,basereg);
basereg:=cg.getaddressregister(jtlist);
cg.a_loadaddr_ref_reg(jtlist,href,basereg);
{ load table slot, 32-bit sign extended }
reference_reset_base(href,basereg,-aint(min_)*4,ctempposinvalid,4,[]);
href.index:=indexreg;
href.scalefactor:=4;
jumpreg:=cg.getaddressregister(current_asmdata.CurrAsmList);
cg.a_load_ref_reg(current_asmdata.CurrAsmList,OS_S32,OS_ADDR,href,jumpreg);
jumpreg:=cg.getaddressregister(jtlist);
cg.a_load_ref_reg(jtlist,OS_S32,OS_ADDR,href,jumpreg);
{ add table address }
reference_reset_base(href,basereg,0,ctempposinvalid,sizeof(pint),[]);
href.index:=jumpreg;
href.scalefactor:=1;
cg.a_loadaddr_ref_reg(current_asmdata.CurrAsmList,href,jumpreg);
cg.a_loadaddr_ref_reg(jtlist,href,jumpreg);
{ and finally jump }
emit_reg(A_JMP,S_NO,jumpreg);
{ generate jump table }
@ -151,9 +190,36 @@ implementation
is inserted right after the routine, it will become part of the
same subsection that contains the routine's code }
sectype:=sec_code;
new_section(current_procinfo.aktlocaldata,sectype,current_procinfo.procdef.mangledname,4);
current_procinfo.aktlocaldata.concat(Tai_label.Create(tablelabel));
genitem(current_procinfo.aktlocaldata,hp);
jtlist := current_procinfo.aktlocaldata;
new_section(jtlist,sectype,current_procinfo.procdef.mangledname,4);
jtlist.concat(Tai_label.Create(tablelabel));
if AlmostExhaustive then
begin
{ Fill the table with the values below _min }
x := lv.svalue;
while x < oldmin do
begin
jtlist.concat(Tai_const.Create_rel_sym(jtitemconsttype,tablelabel,elselabel));
Inc(x);
end;
genitem(hp);
{ Fill the table with the values above _max }
{ Subtracting one from hv and not adding 1 to max_ averts the risk of an overflow }
x := max_;
hv := hv - 1;
while x <= hv.svalue do
begin
jtlist.concat(Tai_const.Create_rel_sym(jtitemconsttype,tablelabel,elselabel));
Inc(x);
end;
end
else
genitem(hp);
end;
begin

2005
tests/bench/bcase.pp Normal file

File diff suppressed because it is too large Load Diff