Add a bound parameter to node_count(_weighted).

2025-08-25 00:11:20 +02:00 · 2022-01-12 11:00:51 +03:00 · 2022-01-12 11:00:51 +03:00 · 11d16be702
commit 11d16be702
parent 2d1ab3410d
4 changed files with 91 additions and 43 deletions
--- a/compiler/ncal.pas
+++ b/compiler/ncal.pas
@ -88,6 +88,7 @@ interface
          procedure add_done_statement(n:tnode);
          procedure convert_carg_array_of_const;
          procedure order_parameters;
          function heuristics_favors_inlining:boolean;
          procedure check_inlining;
          function  pass1_normal:tnode;
          procedure register_created_object_types;
@ -4753,6 +4754,30 @@ implementation
      end;
    function tcallnode.heuristics_favors_inlining:boolean;
      var
        nodeLimit: cardinal;
      begin
        {  Decides whether the callee is small enough to be inlined at the
           current inlining level, which guards against too deep inlining
           recursion and against code bloat.
           The callee qualifies while
                             inlinelevel/3+1    /-------
               node count <  -----------------\/  10000
           so the code may grow exponentially only up to a certain limit.
           Remarks
            - The inlining level is calculated top down, so outer call nodes (nodes closer to the leaf) might not be inlined
              once the max. complexity is reached. This keeps the implementation simple, and there
              might be situations where it is more beneficial to inline inner nodes and keep the calls to the outer nodes
              if the outer nodes are in a seldom used code path
            - The root is computed through exp/ln to avoid using functions from the math unit
        }
        nodeLimit:=round(exp((1.0/(inlinelevel/3.0+1))*ln(10000)));
        { nodeLimit is an exclusive bound; it is also handed to node_count as its max argument }
        result:=nodeLimit>node_count(tprocdef(procdefinition).inlininginfo^.code,nodeLimit);
      end;
    procedure tcallnode.check_inlining;
      var
        st   : tsymtable;
@ -4762,22 +4787,7 @@ implementation
        if (po_inline in procdefinition.procoptions) and
           (procdefinition.typ=procdef) and
           tprocdef(procdefinition).has_inlininginfo and
-           {  Prevent too deep inlining recursion and code bloat by inlining
+           heuristics_favors_inlining then
              The actual formuala is
                                inlinelevel/3+1    /-------
                  node count <  -----------------\/  10000
              This allows exponential grow of the code only to a certain limit.
              Remarks
               - The current approach calculates the inlining level top down, so outer call nodes (nodes closer to the leaf) might not be inlined
                 if the max. complexity is reached. This is done because it makes the implementation easier and because
                 there might be situations were it is more beneficial to inline inner nodes and do the calls to the outer nodes
                 if the outer nodes are in a seldomly used code path
               - The code avoids to use functions from the math unit
           }
           (node_count(tprocdef(procdefinition).inlininginfo^.code)<round(exp((1.0/(inlinelevel/3.0+1))*ln(10000)))) then
          begin
            include(callnodeflags,cnf_do_inline);
            { Check if we can inline the procedure when it references proc/var that
--- a/compiler/nutils.pas
+++ b/compiler/nutils.pas
@ -134,10 +134,11 @@ interface
    function has_conditional_nodes(n : tnode) : boolean;
    { count the number of nodes in the node tree,
-      rough estimation how large the tree "node" is }
+      rough estimation how large the tree "node" is
-    function node_count(node : tnode) : dword;
+      If the tree has more than max nodes, returns max, so node_count(n, 10 + 1) <= 10 answers whether the tree has at most 10 nodes without traversing the rest of a larger tree (e.g. the remaining 990 nodes of a 1000-node tree). }
    function node_count(node : tnode; max : dword = High(dword)) : dword;
-    function node_count_weighted(node : tnode) : dword;
+    function node_count_weighted(node : tnode; max : dword = High(dword)) : dword;
    { returns true, if the value described by node is constant/immutable, this approximation is safe
      if no dirty tricks like buffer overflows or pointer magic are used }
@ -1438,37 +1439,49 @@ implementation
        result:=foreachnodestatic(n,@check_for_conditional_nodes,nil);
      end;
    var
      nodecount : dword;
    function donodecount(var n: tnode; arg: pointer): foreachnoderesult;
      begin
-        inc(nodecount);
+        if PDWord(arg)^>0 then
-        result:=fen_false;
+          begin
            dec(PDWord(arg)^);
            result:=fen_false;
          end
        else
          result:=fen_norecurse_false;
      end;
-    function node_count(node : tnode) : dword;
+    function node_count(node : tnode; max : dword = High(dword)) : dword;
      var
        left : dword;
      begin
-        nodecount:=0;
+        left:=max;
-        foreachnodestatic(node,@donodecount,nil);
+        foreachnodestatic(node,@donodecount,@left);
-        result:=nodecount;
+        result:=max-left;
      end;
    function donodecount_weighted(var n: tnode; arg: pointer): foreachnoderesult;
      begin
-        if not(n.nodetype in [blockn,statementn,callparan,nothingn]) then
+        if PDWord(arg)^>0 then
-          inc(nodecount);
+          begin
-        result:=fen_false;
+            if not(n.nodetype in [blockn,statementn,callparan,nothingn]) then
              dec(PDWord(arg)^);
            result:=fen_false;
          end
        else
          result:=fen_norecurse_false;
      end;
-    function node_count_weighted(node : tnode) : dword;
+    function node_count_weighted(node : tnode; max : dword = High(dword)) : dword;
      var
        left : dword;
      begin
-        nodecount:=0;
+        left:=max;
-        foreachnodestatic(node,@donodecount_weighted,nil);
+        foreachnodestatic(node,@donodecount_weighted,@left);
-        result:=nodecount;
+        result:=max-left;
      end;
--- a/compiler/optloop.pas
+++ b/compiler/optloop.pas
@ -52,6 +52,8 @@ unit optloop;
      procinfo;
    function number_unrolls(node : tnode) : cardinal;
      var
        nodeCount : cardinal;
      begin
        { calculate how often a loop shall be unrolled.
@ -60,10 +62,22 @@ unit optloop;
 {$ifdef i386}
        { multiply by 2 for CPUs with a long pipeline }
        if current_settings.optimizecputype in [cpu_Pentium4] then
-          number_unrolls:=trunc(round((60+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)))
+          begin
            { See the common branch below for an explanation. }
            nodeCount:=node_count_weighted(node,41);
            number_unrolls:=round((60+(60*ord(nodeCount<15)))/max(nodeCount,1))
          end
        else
 {$endif i386}
-          number_unrolls:=trunc(round((30+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)));
+          begin
            { If nodeCount >= 15, numerator will be 30,
              and the largest number (starting from 15) that makes sense as its denominator
              (the smallest number that gives number_unrolls = 1) is 21 = trunc(30/1.5+1),
              so there's no point in counting for more than 21 nodes.
              "Long pipeline" variant above is the same with numerator=60 and max denominator = 41. }
            nodeCount:=node_count_weighted(node,21);
            number_unrolls:=round((30+(60*ord(nodeCount<15)))/max(nodeCount,1));
          end;
        if number_unrolls=0 then
          number_unrolls:=1;
--- a/compiler/psub.pas
+++ b/compiler/psub.pas
@ -1829,6 +1829,23 @@ implementation
             end;
         end;
       function heuristics_favors_autoinlining(code: tnode): boolean;
         var
           nodeBudget : integer;
         begin
           { Rough approximation of whether a routine should be auto inlined.
             Two measures are summed, the size of the tree (node count) and
             its complexity, so a bigger but simpler tree may qualify just
             as well as a smaller but more complex one.
             Equivalent to
               "result:=node_count(code)+node_complexity(code)<=25"
             but the node count is only requested up to the budget that the
             complexity leaves over. }
           nodeBudget:=25-node_complexity(code);
           if nodeBudget<=0 then
             { no budget left for nodes, node_count is not called at all }
             result:=false
           else
             { passing nodeBudget+1 as max keeps "more than nodeBudget nodes" distinguishable }
             result:=node_count(code,nodeBudget+1)<=dword(nodeBudget);
         end;
      var
        old_current_procinfo : tprocinfo;
        oldmaxfpuregisters : longint;
@ -1911,13 +1928,7 @@ implementation
                                           potype_destructor,potype_class_constructor,potype_class_destructor]) and
            ((procdef.procoptions*[po_exports,po_external,po_interrupt,po_virtualmethod,po_iocheck])=[]) and
            (not(procdef.proccalloption in [pocall_safecall])) and
-            { rough approximation if we should auto inline:
+            heuristics_favors_autoinlining(code) then
              - if the tree is simple enough
              - if the tree is not too big
              A bigger tree which is simpler might be autoinlined otoh
              a smaller and complexer tree as well: so we use the sum of
              both measures here }
            (node_count(code)+node_complexity(code)<=25) then
          begin
            { Can we inline this procedure? }
            if checknodeinlining(procdef) then