* Reworked the i386 division-by-constant optimization to reuse code from powerpc64. The algorithm is slightly different: the signed version is one instruction shorter, while the unsigned one is (typically) one instruction longer. Unlike the old algorithm, the new one scales easily to the x86_64 target.

git-svn-id: trunk@27884 -
2025-12-04 04:37:19 +01:00 · 2014-06-07 00:20:05 +00:00 · 2014-06-07 00:20:05 +00:00 · 92cf25b9a5
commit 92cf25b9a5
parent 90d2009a31
1 changed files with 29 additions and 144 deletions
--- a/compiler/i386/n386mat.pas
+++ b/compiler/i386/n386mat.pas
@@ -61,16 +61,6 @@ implementation
                             TI386MODDIVNODE
 *****************************************************************************}

-    function log2(i : dword) : dword;
-      begin
-        result:=0;
-        i:=i shr 1;
-        while i<>0 do
-          begin
-            i:=i shr 1;
-            inc(result);
-          end;
-      end;


   procedure ti386moddivnode.pass_generate_code;
@@ -80,8 +70,10 @@ implementation
        hl:Tasmlabel;
        op:Tasmop;
        e : longint;
-        d,l,r,s,m,a,n,t : dword;
-        m_low,m_high,j,k : qword;
+        d,m: dword;
+        s: byte;
+        sm: aint;
+        m_add: boolean;
      begin
        secondpass(left);
        if codegenerror then
@@ -147,74 +139,28 @@ implementation
                if is_signed(left.resultdef) then
                  begin
                    e:=tordconstnode(right).value.svalue;
-                    d:=abs(e);
-                    { Determine algorithm (a), multiplier (m), and shift factor (s) for 32-bit
-                      signed integer division. Based on: Granlund, T.; Montgomery, P.L.:
-                      "Division by Invariant Integers using Multiplication". SIGPLAN Notices,
-                      Vol. 29, June 1994, page 61.
-                    }
-
-                    l:=log2(d);
-                    j:=qword($80000000) mod qword(d);
-                    k:=(qword(1) shl (32+l)) div (qword($80000000-j));
-                    m_low:=((qword(1)) shl (32+l)) div d;
-                    m_high:=(((qword(1)) shl (32+l)) + k) div d;
-                    while ((m_low shr 1) < (m_high shr 1)) and (l > 0) do
-                      begin
-                        m_low:=m_low shr 1;
-                        m_high:=m_high shr 1;
-                        dec(l);
-                      end;
-                    m:=dword(m_high);
-                    s:=l;
-                    if (m_high shr 31)<>0 then
-                      a:=1
-                    else
-                      a:=0;
+                    calc_divconst_magic_signed(32,e,sm,s);
                    cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-                    emit_const_reg(A_MOV,S_L,aint(m),NR_EAX);
+                    emit_const_reg(A_MOV,S_L,sm,NR_EAX);
                    cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
                    emit_reg(A_IMUL,S_L,hreg1);
-                    emit_reg_reg(A_MOV,S_L,hreg1,NR_EAX);
-                    if a<>0 then
-                      begin
-                        emit_reg_reg(A_ADD,S_L,NR_EAX,NR_EDX);
-                        {
-                          printf ("; dividend: memory location or register other than EAX or EDX\n");
-                          printf ("\n");
-                          printf ("MOV EAX, 0%08LXh\n", m);
-                          printf ("IMUL dividend\n");
-                          printf ("MOV EAX, dividend\n");
-                          printf ("ADD EDX, EAX\n");
-                          if (s) printf ("SAR EDX, %d\n", s);
-                          printf ("SHR EAX, 31\n");
-                          printf ("ADD EDX, EAX\n");
-                          if (e < 0) printf ("NEG EDX\n");
-                          printf ("\n");
-                          printf ("; quotient now in EDX\n");
-                        }
-                      end;
-                      {
-                        printf ("; dividend: memory location of register other than EAX or EDX\n");
-                        printf ("\n");
-                        printf ("MOV EAX, 0%08LXh\n", m);
-                        printf ("IMUL dividend\n");
-                        printf ("MOV EAX, dividend\n");
-                        if (s) printf ("SAR EDX, %d\n", s);
-                        printf ("SHR EAX, 31\n");
-                        printf ("ADD EDX, EAX\n");
-                        if (e < 0) printf ("NEG EDX\n");
-                        printf ("\n");
-                        printf ("; quotient now in EDX\n");
-                      }
-                    if s<>0 then
-                      emit_const_reg(A_SAR,S_L,s,NR_EDX);
-                    emit_const_reg(A_SHR,S_L,31,NR_EAX);
-                    emit_reg_reg(A_ADD,S_L,NR_EAX,NR_EDX);
-                    if e<0 then
-                      emit_reg(A_NEG,S_L,NR_EDX);
-                    cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
+                    { only the high half of result is used }
                    cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
+                    { add or subtract dividend }
+                    if (e>0) and (sm<0) then
+                      emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX)
+                    else if (e<0) and (sm>0) then
+                      emit_reg_reg(A_SUB,S_L,hreg1,NR_EDX);
+                    { shift if necessary }
+                    if (s<>0) then
+                      emit_const_reg(A_SAR,S_L,s,NR_EDX);
+                    { extract and add the sign bit }
+                    if (e<0) then
+                      emit_reg_reg(A_MOV,S_L,NR_EDX,hreg1);
+                    { if e>=0, hreg1 still contains dividend }
+                    emit_const_reg(A_SHR,S_L,31,hreg1);
+                    emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX);
+                    cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
                    location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
                    cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register)
                  end
@@ -230,83 +176,22 @@ implementation
                      end
                    else
                      begin
-                        { Reduce divisor until it becomes odd }
-                        n:=0;
-                        t:=d;
-                        while (t and 1)=0 do
-                          begin
-                            t:=t shr 1;
-                            inc(n);
-                          end;
-                        { Generate m, s for algorithm 0. Based on: Granlund, T.; Montgomery,
-                        P.L.: "Division by Invariant Integers using Multiplication".
-                        SIGPLAN Notices, Vol. 29, June 1994, page 61.
-                        }
-                        l:=log2(t)+1;
-                        j:=qword($ffffffff) mod qword(t);
-                        k:=(qword(1) shl (32+l)) div (qword($ffffffff-j));
-                        m_low:=((qword(1)) shl (32+l)) div t;
-                        m_high:=(((qword(1)) shl (32+l)) + k) div t;
-                        while ((m_low shr 1) < (m_high shr 1)) and (l>0) do
-                          begin
-                            m_low:=m_low shr 1;
-                            m_high:=m_high shr 1;
-                            l:=l-1;
-                          end;
-                        if (m_high shr 32)=0 then
-                          begin
-                            m:=dword(m_high);
-                            s:=l;
-                            a:=0;
-                          end
-
-                        { Generate m, s for algorithm 1. Based on: Magenheimer, D.J.; et al:
-                        "Integer Multiplication and Division on the HP Precision Architecture".
-                        IEEE Transactions on Computers, Vol 37, No. 8, August 1988, page 980.
-                        }
-                        else
-                          begin
-                            s:=log2(t);
-                            m_low:=(qword(1) shl (32+s)) div qword(t);
-                            r:=dword(((qword(1)) shl (32+s)) mod qword(t));
-                            if (r < ((t>>1)+1)) then
-                              m:=dword(m_low)
-                            else
-                              m:=dword(m_low)+1;
-                            a:=1;
-                          end;
-                        { Reduce multiplier for either algorithm to smallest possible }
-                        while (m and 1)=0 do
-                          begin
-                            m:=m shr 1;
-                            dec(s);
-                          end;
-                        { Adjust multiplier for reduction of even divisors }
-                        inc(s,n);
+                        calc_divconst_magic_unsigned(32,d,m,m_add,s);
                        cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
                        emit_const_reg(A_MOV,S_L,aint(m),NR_EAX);
                        cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
                        emit_reg(A_MUL,S_L,hreg1);
-                        if a<>0 then
+                        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
+                        if m_add then
                          begin
-                            {
-                            printf ("; dividend: register other than EAX or memory location\n");
-                            printf ("\n");
-                            printf ("MOV EAX, 0%08lXh\n", m);
-                            printf ("MUL dividend\n");
-                            printf ("ADD EAX, 0%08lXh\n", m);
-                            printf ("ADC EDX, 0\n");
-                            if (s) printf ("SHR EDX, %d\n", s);
-                            printf ("\n");
-                            printf ("; quotient now in EDX\n");
-                            }
-                            emit_const_reg(A_ADD,S_L,aint(m),NR_EAX);
-                            emit_const_reg(A_ADC,S_L,0,NR_EDX);
+                            emit_reg_reg(A_SUB,S_L,NR_EDX,hreg1);
+                            emit_const_reg(A_SHR,S_L,1,hreg1);
+                            emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX);
+                            dec(s);
                          end;
                        if s<>0 then
                          emit_const_reg(A_SHR,S_L,aint(s),NR_EDX);
                        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-                        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
                        location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
                        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register)
                      end;