Add a bound parameter to node_count(_weighted).

2025-04-06 21:07:58 +02:00 · 2022-01-12 11:00:51 +03:00 · 2022-01-12 11:00:51 +03:00 · 11d16be702
commit 11d16be702
parent 2d1ab3410d
4 changed files with 91 additions and 43 deletions
--- a/compiler/ncal.pas
+++ b/compiler/ncal.pas
@ -88,6 +88,7 @@ interface
          procedure add_done_statement(n:tnode);
          procedure convert_carg_array_of_const;
          procedure order_parameters;
+          function heuristics_favors_inlining:boolean;
          procedure check_inlining;
          function  pass1_normal:tnode;
          procedure register_created_object_types;
@ -4753,6 +4754,30 @@ implementation
      end;


+    function tcallnode.heuristics_favors_inlining:boolean;
+      var
+        limExcluding: cardinal;
+      begin
+        {  Prevent too deep inlining recursion and code bloat by inlining
+
+           The actual formula is
+                             inlinelevel/3+1    /-------
+               node count <  -----------------\/  10000
+
+           This allows exponential growth of the code only up to a certain limit.
+
+           Remarks
+            - The current approach calculates the inlining level top down, so outer call nodes (nodes closer to the leaf) might not be inlined
+              if the max. complexity is reached. This is done because it makes the implementation easier and because
+              there might be situations where it is more beneficial to inline inner nodes and do the calls to the outer nodes
+              if the outer nodes are in a seldom used code path
+            - The code avoids using functions from the math unit
+        }
+        limExcluding:=round(exp((1.0/(inlinelevel/3.0+1))*ln(10000))); { exclusive limit: the (inlinelevel/3+1)-th root of 10000, via exp/ln }
+        result:=node_count(tprocdef(procdefinition).inlininginfo^.code,limExcluding)<limExcluding; { the limit is also passed as the counting bound, so a count that reaches it yields false }
+      end;
+
+
    procedure tcallnode.check_inlining;
      var
        st   : tsymtable;
@ -4762,22 +4787,7 @@ implementation
        if (po_inline in procdefinition.procoptions) and
           (procdefinition.typ=procdef) and
           tprocdef(procdefinition).has_inlininginfo and
-           {  Prevent too deep inlining recursion and code bloat by inlining
-
-              The actual formuala is
-                                inlinelevel/3+1    /-------
-                  node count <  -----------------\/  10000
-
-              This allows exponential grow of the code only to a certain limit.
-
-              Remarks
-               - The current approach calculates the inlining level top down, so outer call nodes (nodes closer to the leaf) might not be inlined
-                 if the max. complexity is reached. This is done because it makes the implementation easier and because
-                 there might be situations were it is more beneficial to inline inner nodes and do the calls to the outer nodes
-                 if the outer nodes are in a seldomly used code path
-               - The code avoids to use functions from the math unit
-           }
-           (node_count(tprocdef(procdefinition).inlininginfo^.code)<round(exp((1.0/(inlinelevel/3.0+1))*ln(10000)))) then
+           heuristics_favors_inlining then
          begin
            include(callnodeflags,cnf_do_inline);
            { Check if we can inline the procedure when it references proc/var that
--- a/compiler/nutils.pas
+++ b/compiler/nutils.pas
@ -134,10 +134,11 @@ interface
    function has_conditional_nodes(n : tnode) : boolean;

    { count the number of nodes in the node tree,
-      rough estimation how large the tree "node" is }
-    function node_count(node : tnode) : dword;
+      rough estimation how large the tree "node" is.
+      If the tree has more than max nodes, max is returned and counting stops early, so e.g. for a 1000-node tree node_count(n, 10 + 1) <= 10 answers whether it has ≤10 nodes without counting the remaining ~990. }
+    function node_count(node : tnode; max : dword = High(dword)) : dword;

-    function node_count_weighted(node : tnode) : dword;
+    function node_count_weighted(node : tnode; max : dword = High(dword)) : dword;

    { returns true, if the value described by node is constant/immutable, this approximation is safe
      if no dirty tricks like buffer overflows or pointer magic are used }
@ -1438,37 +1439,49 @@ implementation
        result:=foreachnodestatic(n,@check_for_conditional_nodes,nil);
      end;

-    var
-      nodecount : dword;

    function donodecount(var n: tnode; arg: pointer): foreachnoderesult;
      begin
-        inc(nodecount);
-        result:=fen_false;
+        if PDWord(arg)^>0 then { arg points to a dword holding how many nodes may still be counted }
+          begin
+            dec(PDWord(arg)^); { count this node by consuming one unit of the budget }
+            result:=fen_false;
+          end
+        else
+          result:=fen_norecurse_false; { budget used up: nothing is counted any more; presumably this result also stops descent into the node's children - confirm against foreachnoderesult }
      end;


-    function node_count(node : tnode) : dword;
+    function node_count(node : tnode; max : dword = High(dword)) : dword;
+      var
+        left : dword; { remaining budget, decremented by donodecount for every counted node }
      begin
-        nodecount:=0;
-        foreachnodestatic(node,@donodecount,nil);
-        result:=nodecount;
+        left:=max;
+        foreachnodestatic(node,@donodecount,@left); { the budget is handed to the callback through arg }
+        result:=max-left; { number of nodes counted; never exceeds max because left stops at 0 }
      end;


    function donodecount_weighted(var n: tnode; arg: pointer): foreachnoderesult;
      begin
-        if not(n.nodetype in [blockn,statementn,callparan,nothingn]) then
-          inc(nodecount);
-        result:=fen_false;
+        if PDWord(arg)^>0 then { arg points to a dword holding how many nodes may still be counted }
+          begin
+            if not(n.nodetype in [blockn,statementn,callparan,nothingn]) then { these node types are visited but not counted }
+              dec(PDWord(arg)^);
+            result:=fen_false;
+          end
+        else
+          result:=fen_norecurse_false; { budget used up: nothing is counted any more; presumably this result also stops descent into the node's children - confirm against foreachnoderesult }
      end;


-    function node_count_weighted(node : tnode) : dword;
+    function node_count_weighted(node : tnode; max : dword = High(dword)) : dword;
+      var
+        left : dword; { remaining budget, decremented by donodecount_weighted for every counted node }
      begin
-        nodecount:=0;
-        foreachnodestatic(node,@donodecount_weighted,nil);
-        result:=nodecount;
+        left:=max;
+        foreachnodestatic(node,@donodecount_weighted,@left); { the budget is handed to the callback through arg }
+        result:=max-left; { weighted number of nodes counted; never exceeds max because left stops at 0 }
      end;


--- a/compiler/optloop.pas
+++ b/compiler/optloop.pas
@ -52,6 +52,8 @@ unit optloop;
      procinfo;

    function number_unrolls(node : tnode) : cardinal;
+      var
+        nodeCount : cardinal;
      begin
        { calculate how often a loop shall be unrolled.

@ -60,10 +62,22 @@ unit optloop;
 {$ifdef i386}
        { multiply by 2 for CPUs with a long pipeline }
        if current_settings.optimizecputype in [cpu_Pentium4] then
-          number_unrolls:=trunc(round((60+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)))
+          begin
+            { See the common branch below for an explanation. }
+            nodeCount:=node_count_weighted(node,41);
+            number_unrolls:=round((60+(60*ord(nodeCount<15)))/max(nodeCount,1))
+          end
        else
 {$endif i386}
-          number_unrolls:=trunc(round((30+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)));
+          begin
+            { If nodeCount >= 15, numerator will be 30,
+              and the largest number (starting from 15) that makes sense as its denominator
+              (the smallest number that gives number_unrolls = 1) is 21 = trunc(30/1.5+1),
+              so there's no point in counting for more than 21 nodes.
+              "Long pipeline" variant above is the same with numerator=60 and max denominator = 41. }
+            nodeCount:=node_count_weighted(node,21);
+            number_unrolls:=round((30+(60*ord(nodeCount<15)))/max(nodeCount,1));
+          end;

        if number_unrolls=0 then
          number_unrolls:=1;
--- a/compiler/psub.pas
+++ b/compiler/psub.pas
@ -1829,6 +1829,23 @@ implementation
             end;
         end;

+       function heuristics_favors_autoinlining(code: tnode): boolean;
+         var
+           complexityAvail : integer;
+         begin
+           { rough approximation if we should auto inline:
+             - if the tree is simple enough
+             - if the tree is not too big
+             A bigger tree which is simpler might be autoinlined; on the other hand,
+             so might a smaller but more complex tree: so we use the sum of
+             both measures here }
+
+           { This is a short-circuiting version of "result:=node_count(code)+node_complexity(code)<=25":
+             counting is bounded by complexityAvail+1, and ">=0" keeps the boundary case (complexity 25, 0 nodes) identical. }
+           complexityAvail:=25-node_complexity(code);
+           result:=(complexityAvail>=0) and (node_count(code,complexityAvail+1)<=dword(complexityAvail));
+         end;
+
      var
        old_current_procinfo : tprocinfo;
        oldmaxfpuregisters : longint;
@ -1911,13 +1928,7 @@ implementation
                                           potype_destructor,potype_class_constructor,potype_class_destructor]) and
            ((procdef.procoptions*[po_exports,po_external,po_interrupt,po_virtualmethod,po_iocheck])=[]) and
            (not(procdef.proccalloption in [pocall_safecall])) and
-            { rough approximation if we should auto inline:
-              - if the tree is simple enough
-              - if the tree is not too big
-              A bigger tree which is simpler might be autoinlined otoh
-              a smaller and complexer tree as well: so we use the sum of
-              both measures here }
-            (node_count(code)+node_complexity(code)<=25) then
+            heuristics_favors_autoinlining(code) then
          begin
            { Can we inline this procedure? }
            if checknodeinlining(procdef) then