* patch from Max Nazhalov with improvements to the i8086 64-bit division:

Since the [unnormalized] divisor n is now >= 0x10000 on the "long" division path, we have:
   1. lzv<=47, so all code that performs shifts by >= 48 is dead;
   2. q3=0, so the main loop can be executed just 3 times instead of 4.

git-svn-id: trunk@26535 -
This commit is contained in:
nickysn 2014-01-20 21:39:05 +00:00
parent 8b3217815b
commit a85bb98aa6

View File

@ -232,10 +232,10 @@ function fpc_div_qword( n, z: qword ): qword; [public, alias:'FPC_DIV_QWORD']; c
// see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation // see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation
var var
dig: byte; dig: byte;
u: array [0..7] of word; u: array [0..6] of word;
begin begin
asm asm
mov dig,4 mov dig,3 // quotient contains 3 digits for "long" division path
// Check parameters // Check parameters
mov dx,word [n] mov dx,word [n]
mov cx,word [n+2] mov cx,word [n+2]
@ -284,6 +284,8 @@ begin
jmp @@q jmp @@q
@@n0: // D1. Normalize divisor: @@n0: // D1. Normalize divisor:
// n := n shl lzv, so that 2^63<=n<2^64 // n := n shl lzv, so that 2^63<=n<2^64
// Note: n>=0x10000 leads to lzv<=47 and q3=0
mov word [result+6],si // q3:=0
mov di,si mov di,si
test ax,ax test ax,ax
jnz @@n2 jnz @@n2
@ -316,7 +318,7 @@ begin
mov word [n+4],bx mov word [n+4],bx
mov word [n+6],ax mov word [n+6],ax
// Adjust dividend accordingly: // Adjust dividend accordingly:
// u := uint128(z) shl lzv; lzv=si=0..63; di=0 // u := uint128(z) shl lzv; lzv=si=0..47; di=0
mov dx,word [z] mov dx,word [z]
mov cx,word [z+2] mov cx,word [z+2]
mov bx,word [z+4] mov bx,word [z+4]
@ -347,25 +349,12 @@ begin
dec si dec si
jnz @@m1 jnz @@m1
@@m2: // si=0, bp=lzv @@m2: // si=0, bp=lzv
// di:ax:bx:cx:dx shifted by 0..15; 0|16|32|48 shifts remain // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain
sub bp,16 sub bp,16
jc @@m5 jc @@m5
sub bp,16 sub bp,16
jc @@m4 jc @@m4
sub bp,16 // << 32
jc @@m3
// << 48
pop bp
mov word [u],si
mov word [u+2],si
mov word [u+4],si
mov word [u+6],dx
mov word [u+8],cx
mov word [u+10],bx
mov word [u+12],ax
mov word [u+14],di
jmp @@m6
@@m3: // << 32
pop bp pop bp
mov word [u],si mov word [u],si
mov word [u+2],si mov word [u+2],si
@ -374,7 +363,6 @@ begin
mov word [u+8],bx mov word [u+8],bx
mov word [u+10],ax mov word [u+10],ax
mov word [u+12],di mov word [u+12],di
mov word [u+14],si
jmp @@m6 jmp @@m6
@@m4: // << 16 @@m4: // << 16
pop bp pop bp
@ -385,7 +373,6 @@ begin
mov word [u+8],ax mov word [u+8],ax
mov word [u+10],di mov word [u+10],di
mov word [u+12],si mov word [u+12],si
mov word [u+14],si
jmp @@m6 jmp @@m6
@@m5: // << 0 @@m5: // << 0
pop bp pop bp
@ -396,10 +383,9 @@ begin
mov word [u+8],di mov word [u+8],di
mov word [u+10],si mov word [u+10],si
mov word [u+12],si mov word [u+12],si
mov word [u+14],si @@m6: // D2. Start from j:=2 (since u7=0 and u6<n3), si:=@u[j], bx:=@q[j]
@@m6: // D2. Start from j:=3, si:=@u[j], bx:=@q[j] lea si,word [u+4]
lea si,word [u+6] lea bx,word [result+4]
lea bx,word [result+6]
@@d0: push bx @@d0: push bx
// D3. Estimate the next quotient digit: // D3. Estimate the next quotient digit:
// q_hat := [u(j+4):u(j+3)]/[n3] // q_hat := [u(j+4):u(j+3)]/[n3]
@ -478,10 +464,10 @@ function fpc_mod_qword( n, z: qword ): qword; [public, alias:'FPC_MOD_QWORD']; c
var var
dig: byte; dig: byte;
lzv: word; lzv: word;
u: array [0..7] of word; u: array [0..6] of word;
begin begin
asm asm
mov dig,4 mov dig,3 // quotient contains 3 digits for "long" division path
// Check parameters // Check parameters
mov dx,word [n] mov dx,word [n]
mov cx,word [n+2] mov cx,word [n+2]
@ -530,6 +516,7 @@ begin
jmp @@r6 jmp @@r6
@@n0: // D1. Normalize divisor: @@n0: // D1. Normalize divisor:
// n := n shl lzv, so that 2^63<=n<2^64 // n := n shl lzv, so that 2^63<=n<2^64
// Note: n>=0x10000 leads to lzv<=47
mov di,si mov di,si
test ax,ax test ax,ax
jnz @@n2 jnz @@n2
@ -563,7 +550,7 @@ begin
mov word [n+6],ax mov word [n+6],ax
mov lzv,si mov lzv,si
// Adjust dividend accordingly: // Adjust dividend accordingly:
// u := uint128(z) shl lzv; lzv=si=0..63; di=0 // u := uint128(z) shl lzv; lzv=si=0..47; di=0
mov dx,word [z] mov dx,word [z]
mov cx,word [z+2] mov cx,word [z+2]
mov bx,word [z+4] mov bx,word [z+4]
@ -594,25 +581,12 @@ begin
dec si dec si
jnz @@m1 jnz @@m1
@@m2: // si=0, bp=lzv @@m2: // si=0, bp=lzv
// di:ax:bx:cx:dx shifted by 0..15; 0|16|32|48 shifts remain // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain
sub bp,16 sub bp,16
jc @@m5 jc @@m5
sub bp,16 sub bp,16
jc @@m4 jc @@m4
sub bp,16 // << 32
jc @@m3
// << 48
pop bp
mov word [u],si
mov word [u+2],si
mov word [u+4],si
mov word [u+6],dx
mov word [u+8],cx
mov word [u+10],bx
mov word [u+12],ax
mov word [u+14],di
jmp @@m6
@@m3: // << 32
pop bp pop bp
mov word [u],si mov word [u],si
mov word [u+2],si mov word [u+2],si
@ -621,7 +595,6 @@ begin
mov word [u+8],bx mov word [u+8],bx
mov word [u+10],ax mov word [u+10],ax
mov word [u+12],di mov word [u+12],di
mov word [u+14],si
jmp @@m6 jmp @@m6
@@m4: // << 16 @@m4: // << 16
pop bp pop bp
@ -632,7 +605,6 @@ begin
mov word [u+8],ax mov word [u+8],ax
mov word [u+10],di mov word [u+10],di
mov word [u+12],si mov word [u+12],si
mov word [u+14],si
jmp @@m6 jmp @@m6
@@m5: // << 0 @@m5: // << 0
pop bp pop bp
@ -643,9 +615,8 @@ begin
mov word [u+8],di mov word [u+8],di
mov word [u+10],si mov word [u+10],si
mov word [u+12],si mov word [u+12],si
mov word [u+14],si @@m6: // D2. Start from j:=2 (since u7=0 and u6<n3), si:=@u[j]
@@m6: // D2. Start from j:=3, si:=@u[j] lea si,word [u+4]
lea si,word [u+6]
@@d0: // D3. Estimate the next quotient digit: @@d0: // D3. Estimate the next quotient digit:
// q_hat := [u(j+4):u(j+3)]/[n3] // q_hat := [u(j+4):u(j+3)]/[n3]
// use max.possible q_hat if division overflows // use max.possible q_hat if division overflows
@ -711,14 +682,7 @@ begin
jc @@r2 jc @@r2
sub si,16 sub si,16
jc @@r1 jc @@r1
sub si,16 // >> 32..47
jc @@r0
// >> 48..63
mov bx,ax
mov cx,ax
mov dx,word [u+6]
jmp @@r3
@@r0: // >> 32..47
mov bx,ax mov bx,ax
mov cx,word [u+6] mov cx,word [u+6]
mov dx,word [u+4] mov dx,word [u+4]