From 11d16be702862041c9793a389bbf5c9ce179bda8 Mon Sep 17 00:00:00 2001
From: Rika Ichinose <rrunewalsh@gmail.com>
Date: Wed, 12 Jan 2022 11:00:51 +0300
Subject: [PATCH] Add a bound parameter to node_count(_weighted).

---
 compiler/ncal.pas    | 42 ++++++++++++++++++++++---------------
 compiler/nutils.pas  | 49 ++++++++++++++++++++++++++++----------------
 compiler/optloop.pas | 18 ++++++++++++++--
 compiler/psub.pas    | 25 +++++++++++++++-------
 4 files changed, 91 insertions(+), 43 deletions(-)

diff --git a/compiler/ncal.pas b/compiler/ncal.pas
index 8f4ce6b3af..f6548d0e24 100644
--- a/compiler/ncal.pas
+++ b/compiler/ncal.pas
@@ -88,6 +88,7 @@ interface
           procedure add_done_statement(n:tnode);
           procedure convert_carg_array_of_const;
           procedure order_parameters;
+          function heuristics_favors_inlining:boolean;
           procedure check_inlining;
           function  pass1_normal:tnode;
           procedure register_created_object_types;
@@ -4753,6 +4754,30 @@ implementation
       end;
 
 
+    function tcallnode.heuristics_favors_inlining:boolean;
+      var
+        limExcluding: cardinal;
+      begin
+        {  Prevent too deep inlining recursion and code bloat by inlining
+
+           The actual formula is
+                             inlinelevel/3+1    /-------
+               node count <  -----------------\/  10000
+
+           This allows exponential growth of the code only to a certain limit.
+
+           Remarks
+            - The current approach calculates the inlining level top down, so outer call nodes (nodes closer to the leaf) might not be inlined
+              if the max. complexity is reached. This is done because it makes the implementation easier and because
+              there might be situations where it is more beneficial to inline inner nodes and do the calls to the outer nodes
+              if the outer nodes are in a seldom used code path
+            - The code avoids using functions from the math unit
+        }
+        limExcluding:=round(exp((1.0/(inlinelevel/3.0+1))*ln(10000)));
+        result:=node_count(tprocdef(procdefinition).inlininginfo^.code,limExcluding)<limExcluding;
+      end;
+
+
     procedure tcallnode.check_inlining;
       var
         st   : tsymtable;
@@ -4762,22 +4787,7 @@ implementation
         if (po_inline in procdefinition.procoptions) and
            (procdefinition.typ=procdef) and
            tprocdef(procdefinition).has_inlininginfo and
-           {  Prevent too deep inlining recursion and code bloat by inlining
-
-              The actual formuala is
-                                inlinelevel/3+1    /-------
-                  node count <  -----------------\/  10000
-
-              This allows exponential grow of the code only to a certain limit.
-
-              Remarks
-               - The current approach calculates the inlining level top down, so outer call nodes (nodes closer to the leaf) might not be inlined
-                 if the max. complexity is reached. This is done because it makes the implementation easier and because
-                 there might be situations were it is more beneficial to inline inner nodes and do the calls to the outer nodes
-                 if the outer nodes are in a seldomly used code path
-               - The code avoids to use functions from the math unit
-           }
-           (node_count(tprocdef(procdefinition).inlininginfo^.code)<round(exp((1.0/(inlinelevel/3.0+1))*ln(10000)))) then
+           heuristics_favors_inlining then
           begin
             include(callnodeflags,cnf_do_inline);
             { Check if we can inline the procedure when it references proc/var that
diff --git a/compiler/nutils.pas b/compiler/nutils.pas
index ad1bbc747e..83f4a46811 100644
--- a/compiler/nutils.pas
+++ b/compiler/nutils.pas
@@ -134,10 +134,11 @@ interface
     function has_conditional_nodes(n : tnode) : boolean;
 
     { count the number of nodes in the node tree,
-      rough estimation how large the tree "node" is }
-    function node_count(node : tnode) : dword;
+      rough estimation how large the tree "node" is.
+      Counting stops once max nodes have been seen and max is returned, so e.g. node_count(n, 10 + 1) <= 10 answers whether the tree has at most 10 nodes without traversing the rest of a much larger tree. }
+    function node_count(node : tnode; max : dword = High(dword)) : dword;
 
-    function node_count_weighted(node : tnode) : dword;
+    function node_count_weighted(node : tnode; max : dword = High(dword)) : dword;
 
     { returns true, if the value described by node is constant/immutable, this approximation is safe
       if no dirty tricks like buffer overflows or pointer magic are used }
@@ -1438,37 +1439,49 @@ implementation
         result:=foreachnodestatic(n,@check_for_conditional_nodes,nil);
       end;
 
-    var
-      nodecount : dword;
 
     function donodecount(var n: tnode; arg: pointer): foreachnoderesult;
       begin
-        inc(nodecount);
-        result:=fen_false;
+        if PDWord(arg)^>0 then  { arg points to the caller's count of nodes that may still be counted }
+          begin
+            dec(PDWord(arg)^);  { charge this node against the remaining budget }
+            result:=fen_false;
+          end
+        else
+          result:=fen_norecurse_false;  { budget used up: request no recursion below this node }
       end;
 
 
-    function node_count(node : tnode) : dword;
+    function node_count(node : tnode; max : dword = High(dword)) : dword;
+      var
+        left : dword;
       begin
-        nodecount:=0;
-        foreachnodestatic(node,@donodecount,nil);
-        result:=nodecount;
+        left:=max;  { remaining budget; donodecount decrements it once per counted node }
+        foreachnodestatic(node,@donodecount,@left);
+        result:=max-left;  { number of nodes counted; equals max when the budget ran out }
       end;
 
 
     function donodecount_weighted(var n: tnode; arg: pointer): foreachnoderesult;
       begin
-        if not(n.nodetype in [blockn,statementn,callparan,nothingn]) then
-          inc(nodecount);
-        result:=fen_false;
+        if PDWord(arg)^>0 then  { arg points to the caller's count of nodes that may still be counted }
+          begin
+            if not(n.nodetype in [blockn,statementn,callparan,nothingn]) then  { these node types are visited but not counted }
+              dec(PDWord(arg)^);
+            result:=fen_false;
+          end
+        else
+          result:=fen_norecurse_false;  { budget used up: request no recursion below this node }
       end;
 
 
-    function node_count_weighted(node : tnode) : dword;
+    function node_count_weighted(node : tnode; max : dword = High(dword)) : dword;
+      var
+        left : dword;
       begin
-        nodecount:=0;
-        foreachnodestatic(node,@donodecount_weighted,nil);
-        result:=nodecount;
+        left:=max;  { remaining budget; donodecount_weighted decrements it once per counted node }
+        foreachnodestatic(node,@donodecount_weighted,@left);
+        result:=max-left;  { weighted number of nodes counted; equals max when the budget ran out }
       end;
 
 
diff --git a/compiler/optloop.pas b/compiler/optloop.pas
index 6f2dfce690..c66829d2b0 100644
--- a/compiler/optloop.pas
+++ b/compiler/optloop.pas
@@ -52,6 +52,8 @@ unit optloop;
       procinfo;
 
     function number_unrolls(node : tnode) : cardinal;
+      var
+        nodeCount : cardinal;
       begin
         { calculate how often a loop shall be unrolled.
 
@@ -60,10 +62,22 @@ unit optloop;
 {$ifdef i386}
         { multiply by 2 for CPUs with a long pipeline }
         if current_settings.optimizecputype in [cpu_Pentium4] then
-          number_unrolls:=trunc(round((60+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)))
+          begin
+            { See the common branch below for an explanation. }
+            nodeCount:=node_count_weighted(node,41);
+            number_unrolls:=round((60+(60*ord(nodeCount<15)))/max(nodeCount,1))
+          end
         else
 {$endif i386}
-          number_unrolls:=trunc(round((30+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)));
+          begin
+            { If nodeCount >= 15, numerator will be 30,
+              and the largest number (starting from 15) that makes sense as its denominator
+              (the smallest number that gives number_unrolls = 1) is 21 = trunc(30/1.5+1),
+              so there's no point in counting for more than 21 nodes.
+              "Long pipeline" variant above is the same with numerator=60 and max denominator = 41. }
+            nodeCount:=node_count_weighted(node,21);
+            number_unrolls:=round((30+(60*ord(nodeCount<15)))/max(nodeCount,1));
+          end;
 
         if number_unrolls=0 then
           number_unrolls:=1;
diff --git a/compiler/psub.pas b/compiler/psub.pas
index 06bca7299d..70316dacb2 100644
--- a/compiler/psub.pas
+++ b/compiler/psub.pas
@@ -1829,6 +1829,23 @@ implementation
              end;
          end;
 
+       function heuristics_favors_autoinlining(code: tnode): boolean;
+         var
+           complexityAvail : integer;
+         begin
+           { rough approximation if we should auto inline:
+             - if the tree is simple enough
+             - if the tree is not too big
+             A bigger tree which is simpler might be autoinlined; on the other
+             hand, so might a smaller but more complex tree: so we use the sum of
+             both measures here }
+
+           { This is an early-exit version of "result:=node_count(code)+node_complexity(code)<=25":
+             node_count stops as soon as the remaining budget of 25-node_complexity(code) is exceeded. }
+           complexityAvail:=25-node_complexity(code);
+           result:=(complexityAvail>0) and (node_count(code,complexityAvail+1)<=dword(complexityAvail));
+         end;
+
       var
         old_current_procinfo : tprocinfo;
         oldmaxfpuregisters : longint;
@@ -1911,13 +1928,7 @@ implementation
                                            potype_destructor,potype_class_constructor,potype_class_destructor]) and
             ((procdef.procoptions*[po_exports,po_external,po_interrupt,po_virtualmethod,po_iocheck])=[]) and
             (not(procdef.proccalloption in [pocall_safecall])) and
-            { rough approximation if we should auto inline:
-              - if the tree is simple enough
-              - if the tree is not too big
-              A bigger tree which is simpler might be autoinlined otoh
-              a smaller and complexer tree as well: so we use the sum of
-              both measures here }
-            (node_count(code)+node_complexity(code)<=25) then
+            heuristics_favors_autoinlining(code) then
           begin
             { Can we inline this procedure? }
             if checknodeinlining(procdef) then