+ added 32-bit asm optimized division helpers for i8086 by Max Nazhalov

git-svn-id: trunk@26512 -
This commit is contained in:
nickysn 2014-01-18 20:33:30 +00:00
parent 78e726b34f
commit 4a107dcfa6

View File

@ -72,3 +72,198 @@ begin
HandleErrorAddrFrameInd(215,get_pc_addr,get_frame);
end;
{$define FPC_SYSTEM_HAS_DIV_DWORD}
function fpc_div_dword( n, z: dword ): dword; [public, alias:'FPC_DIV_DWORD']; compilerproc;
begin
{ routine contributed by Max Nazhalov }
result := 0;
if n=0 then
HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
asm
mov ax,word [z]
mov dx,word [z+2]
mov bx,word [n]
mov cx,word [n+2]
// check for underflow: z<n
mov si,dx
cmp ax,bx
sbb si,cx
jc @@3
// select one of 3 trivial cases
test cx,cx
jnz @@1
cmp dx,bx
jnc @@0
// (i) single division: n<=0xFFFF, z<=(n<<16)-1
div bx
mov word [result],ax
jmp @@3
@@0: // (ii) two divisions: n<=0xFFFF, z>(n<<16)-1
// q1 := [0:z1] div n; r := [0:z1] mod n;
// q0 := [r:z0] div n;
xchg ax,cx
xchg ax,dx
{ dx=0, ax=z1, cx=z0 }
div bx
xchg ax,cx
{ dx=r, ax=z0, cx=q1 }
div bx
mov word [result],ax
mov word [result+2],cx
jmp @@3
@@1: // (iii) long divisor: n>=0x10000 (hence q<=0xFFFF)
// Special case of the generic "schoolbook" division [see e.g. Knuth]:
// 1. normalize divisor: [n1:n0] := n<<m, so that 0x8000<=n1<=0xFFFF
// n>=0x10000 -> m<=15
// 2. adjust divident accordingly: [z2:z1:z0] := z<<m
// m<=15 -> z2<=0x7FFF
// implementation: instead do >> dropping n0 and z0
mov si,bx // save n0
mov di,cx // save n1
test ch,ch
jz @@2
mov bl,bh
mov bh,cl
mov cl,ch
mov al,ah
mov ah,dl
mov dl,dh
xor dh,dh
@@2: // repeat >> 1..8 times resulting in [dx:ax]=[z2:z1] and bx=n1
shr cl,1
rcr bx,1
shr dx,1
rcr ax,1
test cl,cl
jnz @@2
// 3. estimate quotient: q_hat := [z2:z1]/n1
// Division never overflows since z2<=0x7FFF and n1>0x7FFF
div bx
// 4. multiply & subtract calculating remainder:
// r := z-n*q_hat (z and n are original)
// 5. adjust quotient: while (r<0) do { q_hat-=1; r+=n };
// theoretically, 0..2 iterations are required [see e.g. Knuth];
// in practice, with such initial data, at most one iteration
// is needed (no disproof has been found yet; and if it will
// ever be found -- it also should raise doubts about the i386
// fpc_div_qword helper again; see FPC mantis #23963)
mov cx,ax // save q_hat
mul si
mov bx,ax
mov si,dx
mov ax,cx
mul di
xor di,di
add ax,si
adc dx,di // [dx:ax:bx] := n*q_hat; di=0
mov si,word [z]
sub si,bx
mov si,word [z+2]
sbb si,ax
sbb di,dx
sbb cx,0
// 6. done: q := [0:cx]
mov word [result],cx
@@3:
end;
end;
{$define FPC_SYSTEM_HAS_MOD_DWORD}
function fpc_mod_dword( n, z: dword ): dword; [public, alias:'FPC_MOD_DWORD']; compilerproc;
begin
{ routine contributed by Max Nazhalov }
result := z;
if n=0 then
HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
asm
mov ax,word [z]
mov dx,word [z+2]
mov bx,word [n]
mov cx,word [n+2]
// check for underflow: z<n
mov si,dx
cmp ax,bx
sbb si,cx
jc @@4
// select one of 3 trivial cases
test cx,cx
jnz @@1
cmp dx,bx
jnc @@0
// (i) single division: n<=0xFFFF, z<=(n<<16)-1
div bx
jmp @@3 // r=cx:dx (cx=0)
@@0: // (ii) two divisions: n<=0xFFFF, z>(n<<16)-1
// q1 := [0:z1] div n; r := [0:z1] mod n;
// q0 := [r:z0] div n; r := [r:z0] mod n;
xchg ax,cx
xchg ax,dx
{ dx=0, ax=z1, cx=z0 }
div bx
mov ax,cx
xor cx,cx
{ dx=r, ax=z0, cx=0 }
div bx
jmp @@3 // r=cx:dx (cx=0)
@@1: // (iii) long divisor: n>=0x10000 (hence q<=0xFFFF)
// Special case of the generic "schoolbook" division [see e.g. Knuth]:
// 1. normalize divisor: [n1:n0] := n<<m, so that 0x8000<=n1<=0xFFFF
// n>=0x10000 -> m<=15
// 2. adjust divident accordingly: [z2:z1:z0] := z<<m
// m<=15 -> z2<=0x7FFF
// implementation: instead do >> dropping n0 and z0
mov si,bx // save n0
mov di,cx // save n1
test ch,ch
jz @@2
mov bl,bh
mov bh,cl
mov cl,ch
mov al,ah
mov ah,dl
mov dl,dh
xor dh,dh
@@2: // repeat >> 1..8 times resulting in [dx:ax]=[z2:z1] and bx=n1
shr cl,1
rcr bx,1
shr dx,1
rcr ax,1
test cl,cl
jnz @@2
// 3. estimate quotient: q_hat := [z2:z1]/n1
// Division never overflows since z2<=0x7FFF and n1>0x7FFF
div bx
// 4. multiply & subtract calculating remainder:
// r := z-n*q_hat (z and n are original)
// 5. adjust quotient: while (r<0) do { q_hat-=1; r+=n };
// theoretically, 0..2 iterations are required [see e.g. Knuth];
// in practice, with such initial data, at most one iteration
// is needed (no disproof has been found yet; and if it will
// ever be found -- it also should raise doubts about the i386
// fpc_div_qword helper again; see FPC mantis #23963)
mov cx,ax // save q_hat
mul si
mov bx,ax
mov si,dx
mov ax,cx
mul di
xor di,di
add ax,si
adc dx,di // [dx:ax:bx] := n*q_hat; di=0
mov si,word [z]
mov cx,word [z+2]
sub si,bx
sbb cx,ax
sbb di,dx
mov dx,si
jnc @@3
add dx,word [n]
adc cx,word [n+2]
@@3: // done: r=cx:dx
mov word [result],dx
mov word [result+2],cx
@@4:
end;
end;