+ added 32-bit asm optimized division helpers for i8086 by Max Nazhalov

git-svn-id: trunk@26512 -
2025-04-11 18:08:15 +02:00 · 2014-01-18 20:33:30 +00:00 · 2014-01-18 20:33:30 +00:00 · 4a107dcfa6
commit 4a107dcfa6
parent 78e726b34f
1 changed files with 195 additions and 0 deletions
--- a/rtl/i8086/int32p.inc
+++ b/rtl/i8086/int32p.inc
@ -72,3 +72,198 @@ begin
    HandleErrorAddrFrameInd(215,get_pc_addr,get_frame);
 end;

+
+{$define FPC_SYSTEM_HAS_DIV_DWORD}
+function fpc_div_dword( n, z: dword ): dword; [public, alias:'FPC_DIV_DWORD']; compilerproc;
+begin
+{ routine contributed by Max Nazhalov
+
+  Unsigned 32-bit division helper built from 16-bit i8086 instructions.
+    n      - divisor
+    z      - dividend
+    result - quotient z div n
+  A zero divisor is reported through HandleErrorAddrFrameInd with error
+  code 200 before the asm block is entered.
+  NOTE(review): the asm block assumes n<>0, i.e. that the error handler
+  does not return - confirm.
+
+  Word naming used in the comments below: z = [z1:z0], n = [n1:n0], with
+  the high word written first.
+
+  The asm block overwrites ax, bx, cx, dx, si and di.
+  NOTE(review): no modified-register list is given after the asm block;
+  confirm that the compiler preserves whatever it needs around it. }
+  result := 0; // default quotient; the asm paths store only the words they compute
+  if n=0 then // zero divisor: report error 200
+    HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
+  asm
+      mov    ax,word [z] // ax = z0
+      mov    dx,word [z+2] // dx = z1
+      mov    bx,word [n] // bx = n0
+      mov    cx,word [n+2] // cx = n1
+      // check for underflow: z<n (32-bit compare done as cmp/sbb on a scratch copy)
+      mov    si,dx // work on a copy so that dx:ax keeps z intact
+      cmp    ax,bx // CF = borrow of z0-n0
+      sbb    si,cx // CF = borrow of z1-n1-CF, i.e. CF=1 iff z<n
+      jc     @@3 // z<n: quotient is 0, which result already holds
+      // select one of 3 trivial cases
+      test   cx,cx
+      jnz    @@1 // n1<>0: divisor does not fit in 16 bits -> case (iii)
+      cmp    dx,bx
+      jnc    @@0 // z1>=n0: a single div would overflow its 16-bit quotient -> case (ii)
+      // (i) single division: n<=0xFFFF, z<=(n<<16)-1
+      div    bx // ax = [z1:z0] div n0
+      mov    word [result],ax // high word of result stays 0 from the preset
+      jmp    @@3
+@@0:  // (ii) two divisions: n<=0xFFFF, z>(n<<16)-1
+      //  q1 := [0:z1] div n; r := [0:z1] mod n;
+      //  q0 := [r:z0] div n;
+      xchg   ax,cx // cx is known to be 0 here: ax = 0, cx = z0
+      xchg   ax,dx // ax = z1, dx = 0
+    { dx=0, ax=z1, cx=z0 }
+      div    bx // ax = q1, dx = r
+      xchg   ax,cx
+    { dx=r, ax=z0, cx=q1 }
+      div    bx // ax = q0; r<n guarantees that this div cannot overflow
+      mov    word [result],ax
+      mov    word [result+2],cx
+      jmp    @@3
+@@1:  // (iii) long divisor: n>=0x10000 (hence q<=0xFFFF)
+      // Special case of the generic "schoolbook" division [see e.g. Knuth]:
+      //  1. normalize divisor: [n1:n0] := n<<m, so that 0x8000<=n1<=0xFFFF
+      //     n>=0x10000 -> m<=15
+      //  2. adjust dividend accordingly: [z2:z1:z0] := z<<m
+      //     m<=15 -> z2<=0x7FFF
+      // implementation: instead do >> dropping n0 and z0 (shift n and z right by 16-m bits)
+      mov    si,bx // save n0 (original low word, needed for the multiply-back)
+      mov    di,cx // save n1 (original high word, needed for the multiply-back)
+      test   ch,ch
+      jz     @@2 // n<0x1000000: the bit-wise loop alone does the whole shift
+      mov    bl,bh // otherwise pre-shift n and z right by 8 bits using byte moves
+      mov    bh,cl // bx = bits 8..23 of n
+      mov    cl,ch // cl = bits 24..31 of n (ch is not consulted any more)
+      mov    al,ah // ax = bits 8..23 of z
+      mov    ah,dl
+      mov    dl,dh // dx = bits 24..31 of z
+      xor    dh,dh
+@@2:  // repeat >> 1..8 times resulting in [dx:ax]=[z2:z1] and bx=n1
+      shr    cl,1 // [cl:bx] >>= 1: the bit leaving cl enters bx through CF
+      rcr    bx,1
+      shr    dx,1 // [dx:ax] >>= 1, keeping z aligned with n
+      rcr    ax,1
+      test   cl,cl
+      jnz    @@2 // stop once the top byte of n is shifted out; bit 15 of bx is then set
+      //  3. estimate quotient: q_hat := [z2:z1]/n1
+      //     Division never overflows since z2<=0x7FFF and n1>0x7FFF
+      div    bx // ax = q_hat
+      //  4. multiply & subtract calculating remainder:
+      //     r := z-n*q_hat (z and n are original)
+      //  5. adjust quotient: while (r<0) do { q_hat-=1; r+=n };
+      //     theoretically, 0..2 iterations are required [see e.g. Knuth];
+      //     in practice, with such initial data, at most one iteration
+      //     is needed (no disproof has been found yet; and if it will
+      //     ever be found -- it also should raise doubts about the i386
+      //     fpc_div_qword helper again; see FPC mantis #23963)
+      mov    cx,ax // save q_hat
+      mul    si // dx:ax = n0*q_hat
+      mov    bx,ax // bx = low word of the product
+      mov    si,dx // si = carry word of n0*q_hat
+      mov    ax,cx
+      mul    di // dx:ax = n1*q_hat
+      xor    di,di
+      add    ax,si
+      adc    dx,di // [dx:ax:bx] := n*q_hat; di=0
+      mov    si,word [z]
+      sub    si,bx // z0 - low product word; only the borrow in CF is used
+      mov    si,word [z+2] // mov does not touch the flags, so CF survives the reload
+      sbb    si,ax // z1 - middle product word - borrow
+      sbb    di,dx // 0 - top product word - borrow: CF=1 iff z<n*q_hat, i.e. r<0
+      sbb    cx,0 // q := q_hat - CF (the single correction step described above)
+      //  6. done: q := [0:cx]
+      mov    word [result],cx // high word of result stays 0 from the preset
+@@3:
+  end;
+end;
+
+
+{$define FPC_SYSTEM_HAS_MOD_DWORD}
+function fpc_mod_dword( n, z: dword ): dword; [public, alias:'FPC_MOD_DWORD']; compilerproc;
+begin
+{ routine contributed by Max Nazhalov
+
+  Unsigned 32-bit remainder helper built from 16-bit i8086 instructions.
+    n      - divisor
+    z      - dividend
+    result - remainder z mod n
+  A zero divisor is reported through HandleErrorAddrFrameInd with error
+  code 200 before the asm block is entered.
+  NOTE(review): the asm block assumes n<>0, i.e. that the error handler
+  does not return - confirm.
+
+  Word naming used in the comments below: z = [z1:z0], n = [n1:n0], with
+  the high word written first. Every computing path leaves the remainder
+  in cx:dx and falls into the common store at label 3.
+
+  The asm block overwrites ax, bx, cx, dx, si and di.
+  NOTE(review): no modified-register list is given after the asm block;
+  confirm that the compiler preserves whatever it needs around it. }
+  result := z; // default remainder; returned unchanged when z<n
+  if n=0 then // zero divisor: report error 200
+    HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
+  asm
+      mov    ax,word [z] // ax = z0
+      mov    dx,word [z+2] // dx = z1
+      mov    bx,word [n] // bx = n0
+      mov    cx,word [n+2] // cx = n1
+      // check for underflow: z<n (32-bit compare done as cmp/sbb on a scratch copy)
+      mov    si,dx // work on a copy so that dx:ax keeps z intact
+      cmp    ax,bx // CF = borrow of z0-n0
+      sbb    si,cx // CF = borrow of z1-n1-CF, i.e. CF=1 iff z<n
+      jc     @@4 // z<n: remainder is z, which result already holds
+      // select one of 3 trivial cases
+      test   cx,cx
+      jnz    @@1 // n1<>0: divisor does not fit in 16 bits -> case (iii)
+      cmp    dx,bx
+      jnc    @@0 // z1>=n0: a single div would overflow its 16-bit quotient -> case (ii)
+      // (i) single division: n<=0xFFFF, z<=(n<<16)-1
+      div    bx // dx = [z1:z0] mod n0; the quotient in ax is not used
+      jmp    @@3 // r=cx:dx (cx=0)
+@@0:  // (ii) two divisions: n<=0xFFFF, z>(n<<16)-1
+      //  q1 := [0:z1] div n; r := [0:z1] mod n;
+      //  q0 := [r:z0] div n; r := [r:z0] mod n;
+      xchg   ax,cx // cx is known to be 0 here: ax = 0, cx = z0
+      xchg   ax,dx // ax = z1, dx = 0
+    { dx=0, ax=z1, cx=z0 }
+      div    bx // dx = r; the quotient q1 in ax is discarded by the next mov
+      mov    ax,cx
+      xor    cx,cx // cx = 0 becomes the high word of the remainder
+    { dx=r, ax=z0, cx=0 }
+      div    bx // dx = [r:z0] mod n; r<n guarantees that this div cannot overflow
+      jmp    @@3 // r=cx:dx (cx=0)
+@@1:  // (iii) long divisor: n>=0x10000 (hence q<=0xFFFF)
+      // Special case of the generic "schoolbook" division [see e.g. Knuth]:
+      //  1. normalize divisor: [n1:n0] := n<<m, so that 0x8000<=n1<=0xFFFF
+      //     n>=0x10000 -> m<=15
+      //  2. adjust dividend accordingly: [z2:z1:z0] := z<<m
+      //     m<=15 -> z2<=0x7FFF
+      // implementation: instead do >> dropping n0 and z0 (shift n and z right by 16-m bits)
+      mov    si,bx // save n0 (original low word, needed for the multiply-back)
+      mov    di,cx // save n1 (original high word, needed for the multiply-back)
+      test   ch,ch
+      jz     @@2 // n<0x1000000: the bit-wise loop alone does the whole shift
+      mov    bl,bh // otherwise pre-shift n and z right by 8 bits using byte moves
+      mov    bh,cl // bx = bits 8..23 of n
+      mov    cl,ch // cl = bits 24..31 of n (ch is not consulted any more)
+      mov    al,ah // ax = bits 8..23 of z
+      mov    ah,dl
+      mov    dl,dh // dx = bits 24..31 of z
+      xor    dh,dh
+@@2:  // repeat >> 1..8 times resulting in [dx:ax]=[z2:z1] and bx=n1
+      shr    cl,1 // [cl:bx] >>= 1: the bit leaving cl enters bx through CF
+      rcr    bx,1
+      shr    dx,1 // [dx:ax] >>= 1, keeping z aligned with n
+      rcr    ax,1
+      test   cl,cl
+      jnz    @@2 // stop once the top byte of n is shifted out; bit 15 of bx is then set
+      //  3. estimate quotient: q_hat := [z2:z1]/n1
+      //     Division never overflows since z2<=0x7FFF and n1>0x7FFF
+      div    bx // ax = q_hat
+      //  4. multiply & subtract calculating remainder:
+      //     r := z-n*q_hat (z and n are original)
+      //  5. adjust quotient: while (r<0) do { q_hat-=1; r+=n };
+      //     theoretically, 0..2 iterations are required [see e.g. Knuth];
+      //     in practice, with such initial data, at most one iteration
+      //     is needed (no disproof has been found yet; and if it will
+      //     ever be found -- it also should raise doubts about the i386
+      //     fpc_div_qword helper again; see FPC mantis #23963)
+      mov    cx,ax // save q_hat
+      mul    si // dx:ax = n0*q_hat
+      mov    bx,ax // bx = low word of the product
+      mov    si,dx // si = carry word of n0*q_hat
+      mov    ax,cx
+      mul    di // dx:ax = n1*q_hat
+      xor    di,di
+      add    ax,si
+      adc    dx,di // [dx:ax:bx] := n*q_hat; di=0
+      mov    si,word [z] // si = z0
+      mov    cx,word [z+2] // cx = z1 (q_hat itself is not needed for the remainder)
+      sub    si,bx // [cx:si] := low 32 bits of z-n*q_hat
+      sbb    cx,ax
+      sbb    di,dx // 0 - top product word - borrow: CF=1 iff z<n*q_hat, i.e. r<0
+      mov    dx,si // dx = low word of r; mov does not touch the flags, CF survives
+      jnc    @@3 // r>=0: the estimate was exact, cx:dx is the remainder
+      add    dx,word [n] // r<0: add n back once (the single correction step described above)
+      adc    cx,word [n+2]
+@@3:  // done: r=cx:dx
+      mov    word [result],dx
+      mov    word [result+2],cx
+@@4:
+  end;
+end;