{

    This file is part of the Free Pascal run time library.
    Copyright (c) 2003 by the Free Pascal development team.

    Implementation of mathematical Routines (for extended type)

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}


{****************************************************************************
                            FPU Control word
 ****************************************************************************}

    procedure Set8087CW(cw:word);
      begin
        { pic-safe ; cw will not be a regvar because it's accessed from }
        { assembler                                                     }
        default8087cw:=cw;
        asm
          fnclex
          fldcw cw
        end;
      end;


    function Get8087CW:word;assembler;
      asm
        push ax
	mov bx, sp
	fnstcw word ptr ss:[bx]
	pop ax
      end;

    procedure Handle_I8086_Error(InterruptNumber : dword); public name 'FPC_HANDLE_I8086_ERROR';
      var
        FpuStatus : word;
        OutError : byte;
      begin
        OutError:=InterruptNumber;
        case byte(InterruptNumber) of
         0 : OutError:=200;    {'Division by Zero'}
         5 : OutError:=201;    {'Bounds Check', not caught yet }
         12 : OutError:=202;   {'Stack Fault', not caught yet }
         7,                    {'Coprocessor not available', not caught yet }
         9,                    {'Coprocessor overrun', not caught yet }
         16,$75 :
          begin
            { This needs special handling }
            { to discriminate between 205,206 and 207 }
            asm
              fnstsw fpustatus { This for is available for 8086 already }
              fnclex
            end;
            if (FpuStatus and FPU_Invalid)<>0 then
              OutError:=216
            else if (FpuStatus and FPU_Denormal)<>0 then
              OutError:=216
            else if (FpuStatus and FPU_DivisionByZero)<>0 then
              OutError:=208
            else if (FpuStatus and FPU_Overflow)<>0 then
              OutError:=205
            else if (FpuStatus and FPU_Underflow)<>0 then
              OutError:=206
            else
              OutError:=207;  {'Coprocessor Error'}
            { if exceptions then Reset FPU and reload control word }
            if (FPUStatus and FPU_ExceptionMask)<>0 then
              SysResetFPU;
          end;
        end;
        HandleError(OutError);
      end;


{****************************************************************************
                       EXTENDED data type routines
 ****************************************************************************}

    {$define FPC_SYSTEM_HAS_ABS}
    function fpc_abs_real(d : ValReal) : ValReal;compilerproc;
    begin
      { Function is handled internal in the compiler }
      runerror(207);
      result:=0;
    end;
    {$define FPC_SYSTEM_HAS_SQR}
    function fpc_sqr_real(d : ValReal) : ValReal;compilerproc;
    begin
      { Function is handled internal in the compiler }
      runerror(207);
      result:=0;
    end;
    {$define FPC_SYSTEM_HAS_SQRT}
    function fpc_sqrt_real(d : ValReal) : ValReal;compilerproc;
    begin
      { Function is handled internal in the compiler }
      runerror(207);
      result:=0;
    end;
    {$define FPC_SYSTEM_HAS_LN}
    function fpc_ln_real(d : ValReal) : ValReal;compilerproc;
    begin
      { Function is handled internal in the compiler }
      runerror(207);
      result:=0;
    end;


    const
      { the exact binary representation of pi (as generated by the fldpi instruction),
        and then divided by 2 and 4. I've tested the following FPUs and they produce
        the exact same values:
          i8087
          Pentium III (Coppermine)
          Athlon 64 (K8)
        }
      Extended_PIO2: array [0..4] of word=($C235,$2168,$DAA2,$C90F,$3FFF);  { pi/2 }
      Extended_PIO4: array [0..4] of word=($C235,$2168,$DAA2,$C90F,$3FFE);  { pi/4 }

    {$define FPC_SYSTEM_HAS_ARCTAN}
    function fpc_arctan_real(d : ValReal) : ValReal;assembler;compilerproc;
      var
        sw: word;
      asm
        { the fpatan instruction on the 8087 and 80287 has the following restrictions:
            0 <= ST(1) < ST(0) < +inf
          which makes it useful only for calculating arctan in the range:
            0 <= d < 1
          so in order to cover the full range, we use the following properties of arctan:
            arctan(1)  = pi/4
            arctan(-d) = -arctan(d)
            arctan(d)  = pi/2 - arctan(1/d), if d>0
        }
        fld tbyte [d]
        ftst
        fstsw sw
        mov ah, byte [sw + 1]
        sahf
        jb @@negative

        { d >= 0 }
        fld1  // 1 d
        fcom
        fstsw sw
        mov ah, byte [sw + 1]
        sahf
        jb @@greater_than_one
        jz @@equal_to_one

        { 0 <= d < 1 }
        fpatan
        jmp @@done

@@greater_than_one:
        { d > 1 }
        fxch st(1)                 // d                1
        fpatan                     // arctan(1/d)
        fld tbyte [Extended_PIO2]  // pi/2             arctan(1/d)
        fsubrp st(1), st           // pi/2-arctan(1/d)
        jmp @@done

@@equal_to_one:
        { d = 1, return pi/4 }
        fstp st
        fstp st
        fld tbyte [Extended_PIO4]
        jmp @@done

@@negative:
        { d < 0; -d > 0 }
        fchs                       // -d
        fld1                       // 1   -d
        fcom
        fstsw sw
        mov ah, byte [sw + 1]
        sahf
        jb @@less_than_minus_one
        jz @@equal_to_minus_one

        { -1 < d < 0; 0 < -d < 1 }
        fpatan                     // arctan(-d)
        fchs                       // -arctan(-d)
        jmp @@done

@@equal_to_minus_one:
        { d = -1, return -pi/4 }
        fstp st
        fstp st
        fld tbyte [Extended_PIO4]
        fchs
        jmp @@done

@@less_than_minus_one:
        { d < -1; -d > 1 }
        fxch st(1)                 // -d                1
        fpatan                     // arctan(-1/d)
        fld tbyte [Extended_PIO2]  // pi/2              arctan(-1/d)
        fsubp st(1), st            // arctan(-1/d)-pi/2

@@done:
      end;

    {$define FPC_SYSTEM_HAS_EXP}
    function fpc_exp_real(d : ValReal) : ValReal;assembler;compilerproc;
      var
        sw1: word;
      asm
        // comes from DJ GPP
        { fixed for 8087 and 80287 by nickysn
          notable differences between 8087/80287 and 80387:
            f2xm1  on 8087/80287  requires that  0<=st(0)<=0.5
            f2xm1  on      80387+ requires that -1<=st(0)<=1
            fscale on 8087/80287  requires that -2**15<=st(1)<=0 or 1<=st(1)<2**15
            fscale on      80387+ has no restrictions
        }
        fld tbyte[d]                   // d
        fldl2e                         // l2e                   d
        fmulp st(1), st                // l2e*d
        fld st(0)                      // l2e*d                 l2e*d
        frndint                        // round(l2e*d)          l2e*d
        fxch st(1)                     // l2e*d                 round(l2e*d)
        fsub st, st(1)                 // l2e*d-round(l2e*d)    round(l2e*d)
        ftst                           // l2e*d-round(l2e*d)<0?
        fstsw sw1
        mov ah, byte [sw1 + 1]
        sahf
        jb @@negative

        f2xm1                          // 2**(l2e*d-round(l2e*d))-1   round(l2e*d)
        fld1                           // 1 2**(l2e*d-round(l2e*d))-1 round(l2e*d)
        faddp st(1), st                // 2**(l2e*d-round(l2e*d))     round(l2e*d)
        jmp @@common

@@negative:
        fchs                           // -l2e*d+round(l2e*d)           round(l2e*d)
        f2xm1                          // 2**(-l2e*d+round(l2e*d))-1    round(l2e*d)
        fld1                           // 1  2**(-l2e*d+round(l2e*d))-1 round(l2e*d)
        fadd st(1), st                 // 1  2**(-l2e*d+round(l2e*d))   round(l2e*d)
        fdivrp st(1), st               // 2**(l2e*d-round(l2e*d))       round(l2e*d)

@@common:
        fscale                         // (2**(l2e*d-round(l2e*d)))*(2**round(l2e*d))  round(l2e*d)
        fstp st(1)                     // (2**(l2e*d-round(l2e*d)))*(2**round(l2e*d))
     end;

    {$define FPC_SYSTEM_HAS_FRAC}
    function fpc_frac_real(d : ValReal) : ValReal;assembler;compilerproc;
      asm
        sub sp, 2
        mov bx, sp
        fnstcw ss:[bx]
        fwait
        mov cl, ss:[bx+1]
        or byte ss:[bx+1], $0f
        fldcw ss:[bx]
        fld tbyte [d]
        frndint
        fld tbyte [d]
        fsub st, st(1)
        fstp st(1)
        mov ss:[bx+1], cl
        fldcw ss:[bx]
        add sp, 2
      end;

    {$define FPC_SYSTEM_HAS_INT}
    function fpc_int_real(d : ValReal) : ValReal;assembler;compilerproc;
      asm
        sub sp, 2
        mov bx, sp
        fnstcw ss:[bx]
        fwait
        mov cl, byte ss:[bx+1]
        or byte ss:[bx+1], $0f
        fldcw ss:[bx]
        fwait
        fld tbyte [d]
        frndint
        fwait
        mov byte ss:[bx+1], cl
        fldcw ss:[bx]
        add sp, 2
      end;

    {$define FPC_SYSTEM_HAS_TRUNC}
    function fpc_trunc_real(d : ValReal) : int64;assembler;compilerproc;
      asm
        sub sp, 10
        mov bx, sp
        fld tbyte [d]
        fnstcw ss:[bx]
        mov cl, ss:[bx+1]
        or byte ss:[bx+1], $0f
        fldcw ss:[bx]
        mov ss:[bx+1], cl
        fistp qword ss:[bx+2]
        fldcw ss:[bx]
        fwait
        mov dx, ss:[bx+2]
        mov cx, ss:[bx+4]
        mov ax, ss:[bx+8]
        { store bx as last }
        mov bx, ss:[bx+6]
        add sp, 10
      end;

    {$define FPC_SYSTEM_HAS_ROUND}
    function fpc_round_real(d : ValReal) : int64;assembler;compilerproc;
      var
        tmp: int64;
      asm
        fld    tbyte [d]
        fistp  qword [tmp]
        fwait
        mov  dx, [tmp]
        mov  cx, [tmp+2]
        mov  bx, [tmp+4]
        mov  ax, [tmp+6]
      end;