* Reworked i386 division by constant optimization to reuse code from powerpc64. The algorithm is slightly different, signed version is one instruction shorter, unsigned one is one instruction longer (typically). The new algorithm is easily scalable for x86_64 target, unlike the old one.

git-svn-id: trunk@27884 -
This commit is contained in:
sergei 2014-06-07 00:20:05 +00:00
parent 90d2009a31
commit 92cf25b9a5

View File

@ -61,16 +61,6 @@ implementation
TI386MODDIVNODE
*****************************************************************************}
function log2(i : dword) : dword;
begin
result:=0;
i:=i shr 1;
while i<>0 do
begin
i:=i shr 1;
inc(result);
end;
end;
procedure ti386moddivnode.pass_generate_code;
@ -80,8 +70,10 @@ implementation
hl:Tasmlabel;
op:Tasmop;
e : longint;
d,l,r,s,m,a,n,t : dword;
m_low,m_high,j,k : qword;
d,m: dword;
s: byte;
sm: aint;
m_add: boolean;
begin
secondpass(left);
if codegenerror then
@ -147,74 +139,28 @@ implementation
if is_signed(left.resultdef) then
begin
e:=tordconstnode(right).value.svalue;
d:=abs(e);
{ Determine algorithm (a), multiplier (m), and shift factor (s) for 32-bit
signed integer division. Based on: Granlund, T.; Montgomery, P.L.:
"Division by Invariant Integers using Multiplication". SIGPLAN Notices,
Vol. 29, June 1994, page 61.
}
l:=log2(d);
j:=qword($80000000) mod qword(d);
k:=(qword(1) shl (32+l)) div (qword($80000000-j));
m_low:=((qword(1)) shl (32+l)) div d;
m_high:=(((qword(1)) shl (32+l)) + k) div d;
while ((m_low shr 1) < (m_high shr 1)) and (l > 0) do
begin
m_low:=m_low shr 1;
m_high:=m_high shr 1;
dec(l);
end;
m:=dword(m_high);
s:=l;
if (m_high shr 31)<>0 then
a:=1
else
a:=0;
calc_divconst_magic_signed(32,e,sm,s);
cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
emit_const_reg(A_MOV,S_L,aint(m),NR_EAX);
emit_const_reg(A_MOV,S_L,sm,NR_EAX);
cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
emit_reg(A_IMUL,S_L,hreg1);
emit_reg_reg(A_MOV,S_L,hreg1,NR_EAX);
if a<>0 then
begin
emit_reg_reg(A_ADD,S_L,NR_EAX,NR_EDX);
{
printf ("; dividend: memory location or register other than EAX or EDX\n");
printf ("\n");
printf ("MOV EAX, 0%08LXh\n", m);
printf ("IMUL dividend\n");
printf ("MOV EAX, dividend\n");
printf ("ADD EDX, EAX\n");
if (s) printf ("SAR EDX, %d\n", s);
printf ("SHR EAX, 31\n");
printf ("ADD EDX, EAX\n");
if (e < 0) printf ("NEG EDX\n");
printf ("\n");
printf ("; quotient now in EDX\n");
}
end;
{
printf ("; dividend: memory location of register other than EAX or EDX\n");
printf ("\n");
printf ("MOV EAX, 0%08LXh\n", m);
printf ("IMUL dividend\n");
printf ("MOV EAX, dividend\n");
if (s) printf ("SAR EDX, %d\n", s);
printf ("SHR EAX, 31\n");
printf ("ADD EDX, EAX\n");
if (e < 0) printf ("NEG EDX\n");
printf ("\n");
printf ("; quotient now in EDX\n");
}
if s<>0 then
emit_const_reg(A_SAR,S_L,s,NR_EDX);
emit_const_reg(A_SHR,S_L,31,NR_EAX);
emit_reg_reg(A_ADD,S_L,NR_EAX,NR_EDX);
if e<0 then
emit_reg(A_NEG,S_L,NR_EDX);
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
{ only the high half of result is used }
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
{ add or subtract dividend }
if (e>0) and (sm<0) then
emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX)
else if (e<0) and (sm>0) then
emit_reg_reg(A_SUB,S_L,hreg1,NR_EDX);
{ shift if necessary }
if (s<>0) then
emit_const_reg(A_SAR,S_L,s,NR_EDX);
{ extract and add the sign bit }
if (e<0) then
emit_reg_reg(A_MOV,S_L,NR_EDX,hreg1);
{ if e>=0, hreg1 still contains dividend }
emit_const_reg(A_SHR,S_L,31,hreg1);
emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX);
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register)
end
@ -230,83 +176,22 @@ implementation
end
else
begin
{ Reduce divisor until it becomes odd }
n:=0;
t:=d;
while (t and 1)=0 do
begin
t:=t shr 1;
inc(n);
end;
{ Generate m, s for algorithm 0. Based on: Granlund, T.; Montgomery,
P.L.: "Division by Invariant Integers using Multiplication".
SIGPLAN Notices, Vol. 29, June 1994, page 61.
}
l:=log2(t)+1;
j:=qword($ffffffff) mod qword(t);
k:=(qword(1) shl (32+l)) div (qword($ffffffff-j));
m_low:=((qword(1)) shl (32+l)) div t;
m_high:=(((qword(1)) shl (32+l)) + k) div t;
while ((m_low shr 1) < (m_high shr 1)) and (l>0) do
begin
m_low:=m_low shr 1;
m_high:=m_high shr 1;
l:=l-1;
end;
if (m_high shr 32)=0 then
begin
m:=dword(m_high);
s:=l;
a:=0;
end
{ Generate m, s for algorithm 1. Based on: Magenheimer, D.J.; et al:
"Integer Multiplication and Division on the HP Precision Architecture".
IEEE Transactions on Computers, Vol 37, No. 8, August 1988, page 980.
}
else
begin
s:=log2(t);
m_low:=(qword(1) shl (32+s)) div qword(t);
r:=dword(((qword(1)) shl (32+s)) mod qword(t));
if (r < ((t>>1)+1)) then
m:=dword(m_low)
else
m:=dword(m_low)+1;
a:=1;
end;
{ Reduce multiplier for either algorithm to smallest possible }
while (m and 1)=0 do
begin
m:=m shr 1;
dec(s);
end;
{ Adjust multiplier for reduction of even divisors }
inc(s,n);
calc_divconst_magic_unsigned(32,d,m,m_add,s);
cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
emit_const_reg(A_MOV,S_L,aint(m),NR_EAX);
cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
emit_reg(A_MUL,S_L,hreg1);
if a<>0 then
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
if m_add then
begin
{
printf ("; dividend: register other than EAX or memory location\n");
printf ("\n");
printf ("MOV EAX, 0%08lXh\n", m);
printf ("MUL dividend\n");
printf ("ADD EAX, 0%08lXh\n", m);
printf ("ADC EDX, 0\n");
if (s) printf ("SHR EDX, %d\n", s);
printf ("\n");
printf ("; quotient now in EDX\n");
}
emit_const_reg(A_ADD,S_L,aint(m),NR_EAX);
emit_const_reg(A_ADC,S_L,0,NR_EDX);
emit_reg_reg(A_SUB,S_L,NR_EDX,hreg1);
emit_const_reg(A_SHR,S_L,1,hreg1);
emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX);
dec(s);
end;
if s<>0 then
emit_const_reg(A_SHR,S_L,aint(s),NR_EDX);
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register)
end;