From 4a107dcfa639977d9aa36577410ab187334f357d Mon Sep 17 00:00:00 2001 From: nickysn Date: Sat, 18 Jan 2014 20:33:30 +0000 Subject: [PATCH] + added 32-bit asm optimized division helpers for i8086 by Max Nazhalov git-svn-id: trunk@26512 - --- rtl/i8086/int32p.inc | 195 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/rtl/i8086/int32p.inc b/rtl/i8086/int32p.inc index f92fc354be..99a4ae37da 100644 --- a/rtl/i8086/int32p.inc +++ b/rtl/i8086/int32p.inc @@ -72,3 +72,198 @@ begin HandleErrorAddrFrameInd(215,get_pc_addr,get_frame); end; + +{$define FPC_SYSTEM_HAS_DIV_DWORD} +function fpc_div_dword( n, z: dword ): dword; [public, alias:'FPC_DIV_DWORD']; compilerproc; +begin +{ routine contributed by Max Nazhalov } + result := 0; + if n=0 then + HandleErrorAddrFrameInd(200,get_pc_addr,get_frame); + asm + mov ax,word [z] + mov dx,word [z+2] + mov bx,word [n] + mov cx,word [n+2] + // check for underflow: z(n<<16)-1 + // q1 := [0:z1] div n; r := [0:z1] mod n; + // q0 := [r:z0] div n; + xchg ax,cx + xchg ax,dx + { dx=0, ax=z1, cx=z0 } + div bx + xchg ax,cx + { dx=r, ax=z0, cx=q1 } + div bx + mov word [result],ax + mov word [result+2],cx + jmp @@3 +@@1: // (iii) long divisor: n>=0x10000 (hence q<=0xFFFF) + // Special case of the generic "schoolbook" division [see e.g. Knuth]: + // 1. normalize divisor: [n1:n0] := n<=0x10000 -> m<=15 + // 2. adjust divident accordingly: [z2:z1:z0] := z< z2<=0x7FFF + // implementation: instead do >> dropping n0 and z0 + mov si,bx // save n0 + mov di,cx // save n1 + test ch,ch + jz @@2 + mov bl,bh + mov bh,cl + mov cl,ch + mov al,ah + mov ah,dl + mov dl,dh + xor dh,dh +@@2: // repeat >> 1..8 times resulting in [dx:ax]=[z2:z1] and bx=n1 + shr cl,1 + rcr bx,1 + shr dx,1 + rcr ax,1 + test cl,cl + jnz @@2 + // 3. estimate quotient: q_hat := [z2:z1]/n1 + // Division never overflows since z2<=0x7FFF and n1>0x7FFF + div bx + // 4. multiply & subtract calculating remainder: + // r := z-n*q_hat (z and n are original) + // 5. adjust quotient: while (r<0) do { q_hat-=1; r+=n }; + // theoretically, 0..2 iterations are required [see e.g. Knuth]; + // in practice, with such initial data, at most one iteration + // is needed (no disproof has been found yet; and if it will + // ever be found -- it also should raise doubts about the i386 + // fpc_div_qword helper again; see FPC mantis #23963) + mov cx,ax // save q_hat + mul si + mov bx,ax + mov si,dx + mov ax,cx + mul di + xor di,di + add ax,si + adc dx,di // [dx:ax:bx] := n*q_hat; di=0 + mov si,word [z] + sub si,bx + mov si,word [z+2] + sbb si,ax + sbb di,dx + sbb cx,0 + // 6. done: q := [0:cx] + mov word [result],cx +@@3: + end; +end; + + +{$define FPC_SYSTEM_HAS_MOD_DWORD} +function fpc_mod_dword( n, z: dword ): dword; [public, alias:'FPC_MOD_DWORD']; compilerproc; +begin +{ routine contributed by Max Nazhalov } + result := z; + if n=0 then + HandleErrorAddrFrameInd(200,get_pc_addr,get_frame); + asm + mov ax,word [z] + mov dx,word [z+2] + mov bx,word [n] + mov cx,word [n+2] + // check for underflow: z(n<<16)-1 + // q1 := [0:z1] div n; r := [0:z1] mod n; + // q0 := [r:z0] div n; r := [r:z0] mod n; + xchg ax,cx + xchg ax,dx + { dx=0, ax=z1, cx=z0 } + div bx + mov ax,cx + xor cx,cx + { dx=r, ax=z0, cx=0 } + div bx + jmp @@3 // r=cx:dx (cx=0) +@@1: // (iii) long divisor: n>=0x10000 (hence q<=0xFFFF) + // Special case of the generic "schoolbook" division [see e.g. Knuth]: + // 1. normalize divisor: [n1:n0] := n<=0x10000 -> m<=15 + // 2. adjust divident accordingly: [z2:z1:z0] := z< z2<=0x7FFF + // implementation: instead do >> dropping n0 and z0 + mov si,bx // save n0 + mov di,cx // save n1 + test ch,ch + jz @@2 + mov bl,bh + mov bh,cl + mov cl,ch + mov al,ah + mov ah,dl + mov dl,dh + xor dh,dh +@@2: // repeat >> 1..8 times resulting in [dx:ax]=[z2:z1] and bx=n1 + shr cl,1 + rcr bx,1 + shr dx,1 + rcr ax,1 + test cl,cl + jnz @@2 + // 3. estimate quotient: q_hat := [z2:z1]/n1 + // Division never overflows since z2<=0x7FFF and n1>0x7FFF + div bx + // 4. multiply & subtract calculating remainder: + // r := z-n*q_hat (z and n are original) + // 5. adjust quotient: while (r<0) do { q_hat-=1; r+=n }; + // theoretically, 0..2 iterations are required [see e.g. Knuth]; + // in practice, with such initial data, at most one iteration + // is needed (no disproof has been found yet; and if it will + // ever be found -- it also should raise doubts about the i386 + // fpc_div_qword helper again; see FPC mantis #23963) + mov cx,ax // save q_hat + mul si + mov bx,ax + mov si,dx + mov ax,cx + mul di + xor di,di + add ax,si + adc dx,di // [dx:ax:bx] := n*q_hat; di=0 + mov si,word [z] + mov cx,word [z+2] + sub si,bx + sbb cx,ax + sbb di,dx + mov dx,si + jnc @@3 + add dx,word [n] + adc cx,word [n+2] +@@3: // done: r=cx:dx + mov word [result],dx + mov word [result+2],cx +@@4: + end; +end;