fpc/rtl/i8086/int32p.inc

{
    This file is part of the Free Pascal run time library.
    Copyright (c) 2013 by the Free Pascal development team

    This file contains some helper routines for longint and dword

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$define FPC_SYSTEM_HAS_MUL_DWORD}
function fpc_mul_dword( f1, f2: dword ): dword; [public,alias: 'FPC_MUL_DWORD']; compilerproc;
begin
{ routine contributed by Max Nazhalov

  32-bit multiplications summary:
  f1 = A1*$10000+A0
  f2 = B1*$10000+B0
  (A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0*B0)

  A1*B1 [only needed for overflow checking; overflow if <>0]
  A1*B0
  A0*B1
  A0:B0
}
  asm
    mov     cx,word[f1]
    mov     ax,word[f1+2]
    mov     di,word[f2]
    mov     si,word[f2+2]
    mul     di
    xchg    ax,si
    mul     cx
    add     si,ax
    xchg    ax,di  { 1 byte shorter than mov }
    mul     cx
    add     dx,si
    mov     word[result],ax
    mov     word[result+2],dx
  end [ 'ax','cx','dx','si','di' ];
end;


function fpc_mul_dword_checkoverflow( f1, f2: dword ): dword; [public,alias: 'FPC_MUL_DWORD_CHECKOVERFLOW']; compilerproc;
begin
{ routine contributed by Max Nazhalov

  32-bit multiplications summary:
  f1 = A1*$10000+A0
  f2 = B1*$10000+B0
  (A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0*B0)

  A1*B1 [only needed for overflow checking; overflow if <>0]
  A1*B0
  A0*B1
  A0:B0
}
  asm
    mov     cx,word[f1]
    mov     ax,word[f1+2]
    mov     di,word[f2]
    mov     si,word[f2+2]
    test    ax,ax
    jz      @@skip
    test    si,si
    jnz     @@overflow
    mul     di
    test    dx,dx
    jnz     @@overflow
@@skip:
    xchg    ax,si
    mul     cx
    test    dx,dx
    jnz     @@overflow
    add     si,ax
    jc      @@overflow
    xchg    ax,di  { 1 byte shorter than mov }
    mul     cx
    add     dx,si
    jc      @@overflow
    // checked and succeed
    mov     word[result],ax
    mov     word[result+2],dx
    jmp     @@done
@@overflow:
    call    FPC_OVERFLOW
@@done:
  end [ 'ax','cx','dx','si','di' ];
end;


{$define FPC_SYSTEM_HAS_DIV_DWORD}
function fpc_div_dword( n, z: dword ): dword; [public, alias:'FPC_DIV_DWORD']; compilerproc;
begin
{ routine contributed by Max Nazhalov }
  result := 0;
  if n=0 then
    HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
  asm
      mov    ax,word [z]
      mov    dx,word [z+2]
      mov    bx,word [n]
      mov    cx,word [n+2]
      // check for underflow: z<n
      mov    si,dx
      cmp    ax,bx
      sbb    si,cx
      jc     @@3
      // select one of 3 trivial cases
      test   cx,cx
      jnz    @@1
      cmp    dx,bx
      jnc    @@0
      // (i) single division: n<=0xFFFF, z<=(n<<16)-1
      div    bx
      mov    word [result],ax
      jmp    @@3
@@0:  // (ii) two divisions: n<=0xFFFF, z>(n<<16)-1
      //  q1 := [0:z1] div n; r := [0:z1] mod n;
      //  q0 := [r:z0] div n;
      xchg   ax,cx
      xchg   ax,dx
    { dx=0, ax=z1, cx=z0 }
      div    bx
      xchg   ax,cx
    { dx=r, ax=z0, cx=q1 }
      div    bx
      mov    word [result],ax
      mov    word [result+2],cx
      jmp    @@3
@@1:  // (iii) long divisor: n>=0x10000 (hence q<=0xFFFF)
      // Special case of the generic "schoolbook" division [see e.g. Knuth]:
      //  1. normalize divisor: [n1:n0] := n<<m, so that 0x8000<=n1<=0xFFFF
      //     n>=0x10000 -> m<=15
      //  2. adjust divident accordingly: [z2:z1:z0] := z<<m
      //     m<=15 -> z2<=0x7FFF
      // implementation: instead do >> dropping n0 and z0
      mov    si,bx // save n0
      mov    di,cx // save n1
      test   ch,ch
      jz     @@2
      mov    bl,bh
      mov    bh,cl
      mov    cl,ch
      mov    al,ah
      mov    ah,dl
      mov    dl,dh
      xor    dh,dh
@@2:  // repeat >> 1..8 times resulting in [dx:ax]=[z2:z1] and bx=n1
      shr    cl,1
      rcr    bx,1
      shr    dx,1
      rcr    ax,1
      test   cl,cl
      jnz    @@2
      //  3. estimate quotient: q_hat := [z2:z1]/n1
      //     Division never overflows since z2<=0x7FFF and n1>0x7FFF
      div    bx
      //  4. multiply & subtract calculating remainder:
      //     r := z-n*q_hat (z and n are original)
      //  5. adjust quotient: while (r<0) do { q_hat-=1; r+=n };
      //     theoretically, 0..2 iterations are required [see e.g. Knuth];
      //     in practice, with such initial data, at most one iteration
      //     is needed (no disproof has been found yet; and if it will
      //     ever be found -- it also should raise doubts about the i386
      //     fpc_div_qword helper again; see FPC mantis #23963)
      mov    cx,ax // save q_hat
      mul    si
      mov    bx,ax
      mov    si,dx
      mov    ax,cx
      mul    di
      xor    di,di
      add    ax,si
      adc    dx,di // [dx:ax:bx] := n*q_hat; di=0
      mov    si,word [z]
      sub    si,bx
      mov    si,word [z+2]
      sbb    si,ax
      sbb    di,dx
      sbb    cx,0
      //  6. done: q := [0:cx]
      mov    word [result],cx
@@3:
  end;
end;


{$define FPC_SYSTEM_HAS_MOD_DWORD}
function fpc_mod_dword( n, z: dword ): dword; [public, alias:'FPC_MOD_DWORD']; compilerproc;
begin
{ routine contributed by Max Nazhalov }
  result := z;
  if n=0 then
    HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
  asm
      mov    ax,word [z]
      mov    dx,word [z+2]
      mov    bx,word [n]
      mov    cx,word [n+2]
      // check for underflow: z<n
      mov    si,dx
      cmp    ax,bx
      sbb    si,cx
      jc     @@4
      // select one of 3 trivial cases
      test   cx,cx
      jnz    @@1
      cmp    dx,bx
      jnc    @@0
      // (i) single division: n<=0xFFFF, z<=(n<<16)-1
      div    bx
      jmp    @@3 // r=cx:dx (cx=0)
@@0:  // (ii) two divisions: n<=0xFFFF, z>(n<<16)-1
      //  q1 := [0:z1] div n; r := [0:z1] mod n;
      //  q0 := [r:z0] div n; r := [r:z0] mod n;
      xchg   ax,cx
      xchg   ax,dx
    { dx=0, ax=z1, cx=z0 }
      div    bx
      mov    ax,cx
      xor    cx,cx
    { dx=r, ax=z0, cx=0 }
      div    bx
      jmp    @@3 // r=cx:dx (cx=0)
@@1:  // (iii) long divisor: n>=0x10000 (hence q<=0xFFFF)
      // Special case of the generic "schoolbook" division [see e.g. Knuth]:
      //  1. normalize divisor: [n1:n0] := n<<m, so that 0x8000<=n1<=0xFFFF
      //     n>=0x10000 -> m<=15
      //  2. adjust divident accordingly: [z2:z1:z0] := z<<m
      //     m<=15 -> z2<=0x7FFF
      // implementation: instead do >> dropping n0 and z0
      mov    si,bx // save n0
      mov    di,cx // save n1
      test   ch,ch
      jz     @@2
      mov    bl,bh
      mov    bh,cl
      mov    cl,ch
      mov    al,ah
      mov    ah,dl
      mov    dl,dh
      xor    dh,dh
@@2:  // repeat >> 1..8 times resulting in [dx:ax]=[z2:z1] and bx=n1
      shr    cl,1
      rcr    bx,1
      shr    dx,1
      rcr    ax,1
      test   cl,cl
      jnz    @@2
      //  3. estimate quotient: q_hat := [z2:z1]/n1
      //     Division never overflows since z2<=0x7FFF and n1>0x7FFF
      div    bx
      //  4. multiply & subtract calculating remainder:
      //     r := z-n*q_hat (z and n are original)
      //  5. adjust quotient: while (r<0) do { q_hat-=1; r+=n };
      //     theoretically, 0..2 iterations are required [see e.g. Knuth];
      //     in practice, with such initial data, at most one iteration
      //     is needed (no disproof has been found yet; and if it will
      //     ever be found -- it also should raise doubts about the i386
      //     fpc_div_qword helper again; see FPC mantis #23963)
      mov    cx,ax // save q_hat
      mul    si
      mov    bx,ax
      mov    si,dx
      mov    ax,cx
      mul    di
      xor    di,di
      add    ax,si
      adc    dx,di // [dx:ax:bx] := n*q_hat; di=0
      mov    si,word [z]
      mov    cx,word [z+2]
      sub    si,bx
      sbb    cx,ax
      sbb    di,dx
      mov    dx,si
      jnc    @@3
      add    dx,word [n]
      adc    cx,word [n+2]
@@3:  // done: r=cx:dx
      mov    word [result],dx
      mov    word [result+2],cx
@@4:
  end;
end;