mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-12-04 04:37:19 +01:00
* Reworked the i386 division-by-constant optimization to reuse code from powerpc64. The algorithm is slightly different: the signed version is one instruction shorter, while the unsigned version is (typically) one instruction longer. Unlike the old algorithm, the new one scales easily to the x86_64 target.
git-svn-id: trunk@27884 -
This commit is contained in:
parent
90d2009a31
commit
92cf25b9a5
@ -61,16 +61,6 @@ implementation
|
||||
TI386MODDIVNODE
|
||||
*****************************************************************************}
|
||||
|
||||
function log2(i : dword) : dword;
  { floor(log2(i)): zero-based position of the most significant set bit.
    Yields 0 for both i=0 and i=1. }
  var
    bits : dword;
  begin
    bits:=0;
    { every halving that still leaves a non-zero value accounts for one bit }
    while i>1 do
      begin
        inc(bits);
        i:=i shr 1;
      end;
    result:=bits;
  end;
|
||||
|
||||
|
||||
procedure ti386moddivnode.pass_generate_code;
|
||||
@ -80,8 +70,10 @@ implementation
|
||||
hl:Tasmlabel;
|
||||
op:Tasmop;
|
||||
e : longint;
|
||||
d,l,r,s,m,a,n,t : dword;
|
||||
m_low,m_high,j,k : qword;
|
||||
d,m: dword;
|
||||
s: byte;
|
||||
sm: aint;
|
||||
m_add: boolean;
|
||||
begin
|
||||
secondpass(left);
|
||||
if codegenerror then
|
||||
@ -147,74 +139,28 @@ implementation
|
||||
if is_signed(left.resultdef) then
|
||||
begin
|
||||
e:=tordconstnode(right).value.svalue;
|
||||
d:=abs(e);
|
||||
{ Determine algorithm (a), multiplier (m), and shift factor (s) for 32-bit
|
||||
signed integer division. Based on: Granlund, T.; Montgomery, P.L.:
|
||||
"Division by Invariant Integers using Multiplication". SIGPLAN Notices,
|
||||
Vol. 29, June 1994, page 61.
|
||||
}
|
||||
|
||||
l:=log2(d);
|
||||
j:=qword($80000000) mod qword(d);
|
||||
k:=(qword(1) shl (32+l)) div (qword($80000000-j));
|
||||
m_low:=((qword(1)) shl (32+l)) div d;
|
||||
m_high:=(((qword(1)) shl (32+l)) + k) div d;
|
||||
while ((m_low shr 1) < (m_high shr 1)) and (l > 0) do
|
||||
begin
|
||||
m_low:=m_low shr 1;
|
||||
m_high:=m_high shr 1;
|
||||
dec(l);
|
||||
end;
|
||||
m:=dword(m_high);
|
||||
s:=l;
|
||||
if (m_high shr 31)<>0 then
|
||||
a:=1
|
||||
else
|
||||
a:=0;
|
||||
calc_divconst_magic_signed(32,e,sm,s);
|
||||
cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
|
||||
emit_const_reg(A_MOV,S_L,aint(m),NR_EAX);
|
||||
emit_const_reg(A_MOV,S_L,sm,NR_EAX);
|
||||
cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
|
||||
emit_reg(A_IMUL,S_L,hreg1);
|
||||
emit_reg_reg(A_MOV,S_L,hreg1,NR_EAX);
|
||||
if a<>0 then
|
||||
begin
|
||||
emit_reg_reg(A_ADD,S_L,NR_EAX,NR_EDX);
|
||||
{
|
||||
printf ("; dividend: memory location or register other than EAX or EDX\n");
|
||||
printf ("\n");
|
||||
printf ("MOV EAX, 0%08LXh\n", m);
|
||||
printf ("IMUL dividend\n");
|
||||
printf ("MOV EAX, dividend\n");
|
||||
printf ("ADD EDX, EAX\n");
|
||||
if (s) printf ("SAR EDX, %d\n", s);
|
||||
printf ("SHR EAX, 31\n");
|
||||
printf ("ADD EDX, EAX\n");
|
||||
if (e < 0) printf ("NEG EDX\n");
|
||||
printf ("\n");
|
||||
printf ("; quotient now in EDX\n");
|
||||
}
|
||||
end;
|
||||
{
|
||||
printf ("; dividend: memory location or register other than EAX or EDX\n");
|
||||
printf ("\n");
|
||||
printf ("MOV EAX, 0%08LXh\n", m);
|
||||
printf ("IMUL dividend\n");
|
||||
printf ("MOV EAX, dividend\n");
|
||||
if (s) printf ("SAR EDX, %d\n", s);
|
||||
printf ("SHR EAX, 31\n");
|
||||
printf ("ADD EDX, EAX\n");
|
||||
if (e < 0) printf ("NEG EDX\n");
|
||||
printf ("\n");
|
||||
printf ("; quotient now in EDX\n");
|
||||
}
|
||||
if s<>0 then
|
||||
emit_const_reg(A_SAR,S_L,s,NR_EDX);
|
||||
emit_const_reg(A_SHR,S_L,31,NR_EAX);
|
||||
emit_reg_reg(A_ADD,S_L,NR_EAX,NR_EDX);
|
||||
if e<0 then
|
||||
emit_reg(A_NEG,S_L,NR_EDX);
|
||||
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
|
||||
{ only the high half of result is used }
|
||||
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
|
||||
{ add or subtract dividend }
|
||||
if (e>0) and (sm<0) then
|
||||
emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX)
|
||||
else if (e<0) and (sm>0) then
|
||||
emit_reg_reg(A_SUB,S_L,hreg1,NR_EDX);
|
||||
{ shift if necessary }
|
||||
if (s<>0) then
|
||||
emit_const_reg(A_SAR,S_L,s,NR_EDX);
|
||||
{ extract and add the sign bit }
|
||||
if (e<0) then
|
||||
emit_reg_reg(A_MOV,S_L,NR_EDX,hreg1);
|
||||
{ if e>=0, hreg1 still contains dividend }
|
||||
emit_const_reg(A_SHR,S_L,31,hreg1);
|
||||
emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX);
|
||||
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
|
||||
location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
|
||||
cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register)
|
||||
end
|
||||
@ -230,83 +176,22 @@ implementation
|
||||
end
|
||||
else
|
||||
begin
|
||||
{ Reduce divisor until it becomes odd }
|
||||
n:=0;
|
||||
t:=d;
|
||||
while (t and 1)=0 do
|
||||
begin
|
||||
t:=t shr 1;
|
||||
inc(n);
|
||||
end;
|
||||
{ Generate m, s for algorithm 0. Based on: Granlund, T.; Montgomery,
|
||||
P.L.: "Division by Invariant Integers using Multiplication".
|
||||
SIGPLAN Notices, Vol. 29, June 1994, page 61.
|
||||
}
|
||||
l:=log2(t)+1;
|
||||
j:=qword($ffffffff) mod qword(t);
|
||||
k:=(qword(1) shl (32+l)) div (qword($ffffffff-j));
|
||||
m_low:=((qword(1)) shl (32+l)) div t;
|
||||
m_high:=(((qword(1)) shl (32+l)) + k) div t;
|
||||
while ((m_low shr 1) < (m_high shr 1)) and (l>0) do
|
||||
begin
|
||||
m_low:=m_low shr 1;
|
||||
m_high:=m_high shr 1;
|
||||
l:=l-1;
|
||||
end;
|
||||
if (m_high shr 32)=0 then
|
||||
begin
|
||||
m:=dword(m_high);
|
||||
s:=l;
|
||||
a:=0;
|
||||
end
|
||||
|
||||
{ Generate m, s for algorithm 1. Based on: Magenheimer, D.J.; et al:
|
||||
"Integer Multiplication and Division on the HP Precision Architecture".
|
||||
IEEE Transactions on Computers, Vol 37, No. 8, August 1988, page 980.
|
||||
}
|
||||
else
|
||||
begin
|
||||
s:=log2(t);
|
||||
m_low:=(qword(1) shl (32+s)) div qword(t);
|
||||
r:=dword(((qword(1)) shl (32+s)) mod qword(t));
|
||||
if (r < ((t>>1)+1)) then
|
||||
m:=dword(m_low)
|
||||
else
|
||||
m:=dword(m_low)+1;
|
||||
a:=1;
|
||||
end;
|
||||
{ Reduce multiplier for either algorithm to smallest possible }
|
||||
while (m and 1)=0 do
|
||||
begin
|
||||
m:=m shr 1;
|
||||
dec(s);
|
||||
end;
|
||||
{ Adjust multiplier for reduction of even divisors }
|
||||
inc(s,n);
|
||||
calc_divconst_magic_unsigned(32,d,m,m_add,s);
|
||||
cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
|
||||
emit_const_reg(A_MOV,S_L,aint(m),NR_EAX);
|
||||
cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
|
||||
emit_reg(A_MUL,S_L,hreg1);
|
||||
if a<>0 then
|
||||
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
|
||||
if m_add then
|
||||
begin
|
||||
{
|
||||
printf ("; dividend: register other than EAX or memory location\n");
|
||||
printf ("\n");
|
||||
printf ("MOV EAX, 0%08lXh\n", m);
|
||||
printf ("MUL dividend\n");
|
||||
printf ("ADD EAX, 0%08lXh\n", m);
|
||||
printf ("ADC EDX, 0\n");
|
||||
if (s) printf ("SHR EDX, %d\n", s);
|
||||
printf ("\n");
|
||||
printf ("; quotient now in EDX\n");
|
||||
}
|
||||
emit_const_reg(A_ADD,S_L,aint(m),NR_EAX);
|
||||
emit_const_reg(A_ADC,S_L,0,NR_EDX);
|
||||
emit_reg_reg(A_SUB,S_L,NR_EDX,hreg1);
|
||||
emit_const_reg(A_SHR,S_L,1,hreg1);
|
||||
emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX);
|
||||
dec(s);
|
||||
end;
|
||||
if s<>0 then
|
||||
emit_const_reg(A_SHR,S_L,aint(s),NR_EDX);
|
||||
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
|
||||
cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
|
||||
location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
|
||||
cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register)
|
||||
end;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user