From 11d16be702862041c9793a389bbf5c9ce179bda8 Mon Sep 17 00:00:00 2001 From: Rika Ichinose Date: Wed, 12 Jan 2022 11:00:51 +0300 Subject: [PATCH] Add a bound parameter to node_count(_weighted). --- compiler/ncal.pas | 42 ++++++++++++++++++++++--------------- compiler/nutils.pas | 49 ++++++++++++++++++++++++++++---------------- compiler/optloop.pas | 18 ++++++++++++++-- compiler/psub.pas | 25 +++++++++++++++------- 4 files changed, 91 insertions(+), 43 deletions(-) diff --git a/compiler/ncal.pas b/compiler/ncal.pas index 8f4ce6b3af..f6548d0e24 100644 --- a/compiler/ncal.pas +++ b/compiler/ncal.pas @@ -88,6 +88,7 @@ interface procedure add_done_statement(n:tnode); procedure convert_carg_array_of_const; procedure order_parameters; + function heuristics_favors_inlining:boolean; procedure check_inlining; function pass1_normal:tnode; procedure register_created_object_types; @@ -4753,6 +4754,30 @@ implementation end; + function tcallnode.heuristics_favors_inlining:boolean; + var + limExcluding: cardinal; + begin + { Prevent too deep inlining recursion and code bloat by inlining + + The actual formula is + inlinelevel/3+1 /------- + node count < -----------------\/ 10000 + + This allows exponential growth of the code only up to a certain limit. + + Remarks + - The current approach calculates the inlining level top down, so outer call nodes (nodes closer to the leaf) might not be inlined + if the max. complexity is reached. 
This is done because it makes the implementation easier and because + there might be situations where it is more beneficial to inline inner nodes and do the calls to the outer nodes + if the outer nodes are in a seldom used code path + - The code avoids using functions from the math unit + } + limExcluding:=round(exp((1.0/(inlinelevel/3.0+1))*ln(10000))); + result:=node_count(tprocdef(procdefinition).inlininginfo^.code,limExcluding)0 then + begin + dec(PDWord(arg)^); + result:=fen_false; + end + else + result:=fen_norecurse_false; end; - function node_count(node : tnode) : dword; + function node_count(node : tnode; max : dword = High(dword)) : dword; + var + left : dword; begin - nodecount:=0; - foreachnodestatic(node,@donodecount,nil); - result:=nodecount; + left:=max; { remaining budget; donodecount decrements it for each counted node while it is above zero } + foreachnodestatic(node,@donodecount,@left); + result:=max-left; { number of nodes counted, never more than max } end; function donodecount_weighted(var n: tnode; arg: pointer): foreachnoderesult; begin - if not(n.nodetype in [blockn,statementn,callparan,nothingn]) then - inc(nodecount); - result:=fen_false; + if PDWord(arg)^>0 then + begin + if not(n.nodetype in [blockn,statementn,callparan,nothingn]) then + dec(PDWord(arg)^); + result:=fen_false; + end + else + result:=fen_norecurse_false; end; - function node_count_weighted(node : tnode) : dword; + function node_count_weighted(node : tnode; max : dword = High(dword)) : dword; + var + left : dword; begin - nodecount:=0; - foreachnodestatic(node,@donodecount_weighted,nil); - result:=nodecount; + left:=max; { remaining budget; donodecount_weighted decrements it for each counted node while it is above zero } + foreachnodestatic(node,@donodecount_weighted,@left); + result:=max-left; { number of nodes counted, never more than max } end; diff --git a/compiler/optloop.pas b/compiler/optloop.pas index 6f2dfce690..c66829d2b0 100644 --- a/compiler/optloop.pas +++ b/compiler/optloop.pas @@ -52,6 +52,8 @@ unit optloop; procinfo; function number_unrolls(node : tnode) : cardinal; + var + nodeCount : cardinal; begin { calculate how often a loop shall be unrolled. 
@@ -60,10 +62,22 @@ unit optloop; {$ifdef i386} { multiply by 2 for CPUs with a long pipeline } if current_settings.optimizecputype in [cpu_Pentium4] then - number_unrolls:=trunc(round((60+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1))) + begin + { See the common branch below for an explanation. } + nodeCount:=node_count_weighted(node,41); + number_unrolls:=round((60+(60*ord(nodeCount<15)))/max(nodeCount,1)) + end else {$endif i386} - number_unrolls:=trunc(round((30+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1))); + begin + { If nodeCount >= 15, numerator will be 30, + and the largest number (starting from 15) that makes sense as its denominator + (the smallest number that gives number_unrolls = 1) is 21 = trunc(30/1.5+1), + so there's no point in counting for more than 21 nodes. + "Long pipeline" variant above is the same with numerator=60 and max denominator = 41. } + nodeCount:=node_count_weighted(node,21); + number_unrolls:=round((30+(60*ord(nodeCount<15)))/max(nodeCount,1)); + end; if number_unrolls=0 then number_unrolls:=1; diff --git a/compiler/psub.pas b/compiler/psub.pas index 06bca7299d..70316dacb2 100644 --- a/compiler/psub.pas +++ b/compiler/psub.pas @@ -1829,6 +1829,23 @@ implementation end; end; + function heuristics_favors_autoinlining(code: tnode): boolean; + var + complexityAvail : integer; + begin + { rough approximation if we should auto inline: + - if the tree is simple enough + - if the tree is not too big + A bigger tree which is simpler might be autoinlined; on the other hand, + a smaller and more complex tree as well: so we use the sum of + both measures here } + + { This is a short-circuited version of + "result:=node_count(code)+node_complexity(code)<=25". 
Counting is capped at complexityAvail+1 nodes, which is already enough to tell that the limit is exceeded; no counting happens at all when complexityAvail<=0. } + complexityAvail:=25-node_complexity(code); + result:=(complexityAvail>0) and (node_count(code,complexityAvail+1)<=dword(complexityAvail)); + end; + var old_current_procinfo : tprocinfo; oldmaxfpuregisters : longint; @@ -1911,13 +1928,7 @@ implementation potype_destructor,potype_class_constructor,potype_class_destructor]) and ((procdef.procoptions*[po_exports,po_external,po_interrupt,po_virtualmethod,po_iocheck])=[]) and (not(procdef.proccalloption in [pocall_safecall])) and - { rough approximation if we should auto inline: - - if the tree is simple enough - - if the tree is not too big - A bigger tree which is simpler might be autoinlined otoh - a smaller and complexer tree as well: so we use the sum of - both measures here } - (node_count(code)+node_complexity(code)<=25) then + heuristics_favors_autoinlining(code) then begin { Can we inline this procedure? } if checknodeinlining(procdef) then