fpc/rtl/inc/softfpu.pp

{*
===============================================================================
The original notice of the softfloat package is shown below. The conversion
to pascal was done by Carl Eric Codere in 2002 (ccodere@ieee.org).
===============================================================================

This C source file is part of the SoftFloat IEC/IEEE Floating-Point
Arithmetic Package, Release 2a.

Written by John R. Hauser.  This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704.  Funding was partially provided by the
National Science Foundation under grant MIP-9311980.  The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
is available through the Web page
`http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) they include prominent notice that the work is derivative, and (2) they
include prominent notice akin to these four paragraphs for those parts of
this code that are retained.

===============================================================================

The float80 and float128 part is translated from the softfloat package
by Florian Klaempfl and contained the following copyright notice

The code might contain some duplicate stuff because the floatx80/float128 port was
done based on the 64 bit enabled softfloat code.

===============================================================================

This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
Package, Release 2b.

Written by John R. Hauser.  This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704.  Funding was partially provided by the
National Science Foundation under grant MIP-9311980.  The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) the source code for the derivative work includes prominent notice that
the work is derivative, and (2) the source code includes prominent notice with
these four paragraphs for those parts of this code that are retained.


===============================================================================
*}

{ $define FPC_SOFTFLOAT_FLOATX80}
{ $define FPC_SOFTFLOAT_FLOAT128}

{ the softfpu unit can be also embedded directly into the system unit }

{$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}

{$mode objfpc}
{$IFNDEF FPC_DOTTEDUNITS}
unit softfpu;
{$ENDIF FPC_DOTTEDUNITS}

{ Overflow checking must be disabled,
  since some operations expect overflow!
}
{$Q-}
{$goto on}

interface
{$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}

{$if not(defined(fpc_softfpu_implementation))}
{
-------------------------------------------------------------------------------
Software IEC/IEEE floating-point types.
-------------------------------------------------------------------------------
}
TYPE
{$ifndef FPC_SYSTEM_HAS_float32}
  float32 = longword;
{$define FPC_SYSTEM_HAS_float32}
{$endif ndef FPC_SYSTEM_HAS_float32}
  { we use here a record in the function header because
    the record allows bitwise conversion to single }
  float32rec = record
    float32 : float32;
  end;

  flag = byte;

  bits8 = byte;
  sbits8 = shortint;
  bits16 = word;
  sbits16 = smallint;
  sbits32 = longint;
  bits32 = longword;
{$ifndef fpc}
  qword = int64;
{$endif}
  { now part of the system unit
  uint64 = qword;
  }
  bits64 = qword;
  sbits64 = int64;

{$ifdef ENDIAN_LITTLE}
{$ifndef FPC_SYSTEM_HAS_float64}
  float64 = record
    case byte of
      // force the record to be aligned like a double
      // else *_to_double will fail for cpus like sparc
      // and avoid expensive unpacking/packing operations
      1: (dummy : double);
      2: (low,high : bits32);
  end;
{$endif ndef FPC_SYSTEM_HAS_float64}

  floatx80 = record
    case byte of
      // force the record to be aligned like a double
      // else *_to_double will fail for cpus like sparc
      // and avoid expensive unpacking/packing operations
      1: (dummy : extended);
      2: (low : qword;high : word);
  end;

  float128 = record
    case byte of
      // force the record to be aligned like a double
      // else *_to_double will fail for cpus like sparc
      // and avoid expensive unpacking/packing operations
      1: (dummy : qword);
      2: (low,high : qword);
  end;
{$else}
{$ifndef FPC_SYSTEM_HAS_float64}
  float64 = record
      case byte of
        // force the record to be aligned like a double
        // else *_to_double will fail for cpus like sparc
        1: (dummy : double);
        2: (high,low : bits32);
  end;
{$endif ndef FPC_SYSTEM_HAS_float64}

  floatx80 = record
    case byte of
      // force the record to be aligned like a double
      // else *_to_double will fail for cpus like sparc
      // and avoid expensive unpacking/packing operations
      1: (dummy : qword);
      2: (high : word;low : qword);
  end;

  float128 = record
    case byte of
      // force the record to be aligned like a double
      // else *_to_double will fail for cpus like sparc
      // and avoid expensive unpacking/packing operations
      1: (dummy : qword);
      2: (high : qword;low : qword);
  end;
{$endif}

{$define FPC_SYSTEM_HAS_float64}


{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is less than
the corresponding value `b', and 0 otherwise.  The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_lt(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is less than
or equal to the corresponding value `b', and 0 otherwise.  The comparison
is performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_le(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise.  The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_eq(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the square root of the double-precision floating-point value `a'.
The operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
function float64_sqrt( a: float64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the remainder of the double-precision floating-point value `a'
with respect to the corresponding value `b'.  The operation is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_rem(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of dividing the double-precision floating-point value `a'
by the corresponding value `b'.  The operation is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_div(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of multiplying the double-precision floating-point values
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_mul( a: float64; b:float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of subtracting the double-precision floating-point values
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_sub(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of adding the double-precision floating-point values `a'
and `b'.  The operation is performed according to the IEC/IEEE Standard for
Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_add( a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Rounds the double-precision floating-point value `a' to an integer,
and returns the result as a double-precision floating-point value.  The
operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_round_to_int(a: float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of converting the double-precision floating-point value
`a' to the single-precision floating-point format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_to_float32(a: float64) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of converting the double-precision floating-point value
`a' to the 32-bit two's complement integer format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic, except that the conversion is always rounded toward zero.
If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
the conversion overflows, the largest integer with the same sign as `a' is
returned.
-------------------------------------------------------------------------------
*}
Function float64_to_int32_round_to_zero(a: float64 ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of converting the double-precision floating-point value
`a' to the 32-bit two's complement integer format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic---which means in particular that the conversion is rounded
according to the current rounding mode.  If `a' is a NaN, the largest
positive integer is returned.  Otherwise, if the conversion overflows, the
largest integer with the same sign as `a' is returned.
-------------------------------------------------------------------------------
*}
Function float64_to_int32(a: float64): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is less than
the corresponding value `b', and 0 otherwise.  The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is less than
or equal to the corresponding value `b', and 0 otherwise.  The comparison
is performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_le( a: float32rec; b : float32rec ):flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise.  The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the square root of the single-precision floating-point value `a'.
The operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_sqrt(a: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the remainder of the single-precision floating-point value `a'
with respect to the corresponding value `b'.  The operation is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of dividing the single-precision floating-point value `a'
by the corresponding value `b'.  The operation is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of multiplying the single-precision floating-point values
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of subtracting the single-precision floating-point values
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_sub( a: float32rec ; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of adding the single-precision floating-point values `a'
and `b'.  The operation is performed according to the IEC/IEEE Standard for
Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Rounds the single-precision floating-point value `a' to an integer,
and returns the result as a single-precision floating-point value.  The
operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_round_to_int( a: float32rec): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point value
`a' to the double-precision floating-point format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_to_float64( a : float32rec) : Float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point value
`a' to the 32-bit two's complement integer format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic, except that the conversion is always rounded toward zero.
If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
the conversion overflows, the largest integer with the same sign as `a' is
returned.
-------------------------------------------------------------------------------
*}
Function float32_to_int32_round_to_zero( a: Float32rec ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point value
`a' to the 32-bit two's complement integer format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic---which means in particular that the conversion is rounded
according to the current rounding mode.  If `a' is a NaN, the largest
positive integer is returned.  Otherwise, if the conversion overflows, the
largest integer with the same sign as `a' is returned.
-------------------------------------------------------------------------------
*}
Function float32_to_int32( a : float32rec) : int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of converting the 32-bit two's complement integer `a' to
the double-precision floating-point format.  The conversion is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function int32_to_float64( a: int32) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
{*
-------------------------------------------------------------------------------
Returns the result of converting the 32-bit two's complement integer `a' to
the single-precision floating-point format.  The conversion is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}

{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the double-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}
Function int64_to_float64( a: int64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
Function qword_to_float64( a: qword ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}

{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the single-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}
Function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
Function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}

// +++
function float32_to_int64( a: float32 ): int64;
function float32_to_int64_round_to_zero( a: float32 ): int64;
function float32_eq_signaling( a: float32; b: float32) : flag;
function float32_le_quiet( a: float32 ; b : float32 ): flag;
function float32_lt_quiet( a: float32 ; b: float32 ): flag;
function float32_is_signaling_nan( a : float32  ): flag;
function float32_is_nan( a : float32 ): flag;
function float64_to_int64( a: float64 ): int64;
function float64_to_int64_round_to_zero( a: float64 ): int64;
function float64_eq_signaling( a: float64; b: float64): flag;
function float64_le_quiet(a: float64 ; b: float64 ): flag;
function float64_lt_quiet(a: float64; b: float64 ): Flag;
function float64_is_signaling_nan( a : float64 ): flag;
function float64_is_nan( a : float64 ): flag;
// ===

{$ifdef FPC_SOFTFLOAT_FLOATX80}
{*----------------------------------------------------------------------------
| Extended double-precision rounding precision
*----------------------------------------------------------------------------*}
var // threadvar!?
    floatx80_rounding_precision : int8 = 80;

function int32_to_floatx80( a: int32 ): floatx80;
function int64_to_floatx80( a: int64 ): floatx80;
function qword_to_floatx80( a: qword ): floatx80;
function float32_to_floatx80( a: float32 ): floatx80;
function float64_to_floatx80( a: float64 ): floatx80;
function floatx80_to_int32( a: floatx80 ): int32;
function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
function floatx80_to_int64( a: floatx80 ): int64;
function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
function floatx80_to_float32( a: floatx80 ): float32;
function floatx80_to_float64( a: floatx80 ): float64;
{$ifdef FPC_SOFTFLOAT_FLOAT128}
function floatx80_to_float128( a: floatx80 ): float128;
{$endif FPC_SOFTFLOAT_FLOAT128}
function floatx80_round_to_int( a: floatx80 ): floatx80;
function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
function floatx80_sqrt( a: floatx80 ): floatx80;
function floatx80_eq( a: floatx80; b: floatx80 ): flag;
function floatx80_le( a: floatx80; b: floatx80 ): flag;
function floatx80_lt( a: floatx80; b: floatx80 ): flag;
function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
function floatx80_is_signaling_nan( a: floatx80 ): flag;
function floatx80_is_nan(a : floatx80 ): flag;
{$endif FPC_SOFTFLOAT_FLOATX80}

{$ifdef FPC_SOFTFLOAT_FLOAT128}
function int32_to_float128( a: int32 ): float128;
function int64_to_float128( a: int64 ): float128;
function qword_to_float128( a: qword ): float128;
function float32_to_float128( a: float32 ): float128;
function float128_is_nan( a : float128): flag;
function float128_is_signaling_nan( a : float128): flag;
function float128_to_int32(a: float128): int32;
function float128_to_int32_round_to_zero(a: float128): int32;
function float128_to_int64(a: float128): int64;
function float128_to_int64_round_to_zero(a: float128): int64;
function float128_to_float32(a: float128): float32;
function float128_to_float64(a: float128): float64;
function float64_to_float128( a : float64) : float128;
{$ifdef FPC_SOFTFLOAT_FLOATX80}
function float128_to_floatx80(a: float128): floatx80;
{$endif FPC_SOFTFLOAT_FLOATX80}
function float128_round_to_int(a: float128): float128;
function float128_add(a: float128; b: float128): float128;
function float128_sub(a: float128; b: float128): float128;
function float128_mul(a: float128; b: float128): float128;
function float128_div(a: float128; b: float128): float128;
function float128_rem(a: float128; b: float128): float128;
function float128_sqrt(a: float128): float128;
function float128_eq(a: float128; b: float128): flag;
function float128_le(a: float128; b: float128): flag;
function float128_lt(a: float128; b: float128): flag;
function float128_eq_signaling(a: float128; b: float128): flag;
function float128_le_quiet(a: float128; b: float128): flag;
function float128_lt_quiet(a: float128; b: float128): flag;
{$endif FPC_SOFTFLOAT_FLOAT128}

CONST
{-------------------------------------------------------------------------------
Software IEC/IEEE floating-point underflow tininess-detection mode.
-------------------------------------------------------------------------------
*}
    float_tininess_after_rounding  = 0;
    float_tininess_before_rounding = 1;

{*
-------------------------------------------------------------------------------
Underflow tininess-detection mode, statically initialized to default value.
(The declaration in `softfloat.h' must match the `int8' type here.)
-------------------------------------------------------------------------------
*}

var // threadvar!?
    softfloat_detect_tininess: int8 = float_tininess_after_rounding;

{$endif  not(defined(fpc_softfpu_implementation))}

{$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
implementation
{$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}


{$if not(defined(fpc_softfpu_interface))}

{$ifdef FPC}
  { disable range and overflow checking explicitly }
  { This might be more essential for x80 and 128-bit
    floating point types and could, maybe be
    restricted to code handle floatx80 and float128 }
  {$push}
  {$R-}
  {$Q-}
{$endif FPC}

(*****************************************************************************)
(*----------------------------------------------------------------------------*)
(* Primitive arithmetic functions, including multi-word arithmetic, and       *)
(* division and square root approximations.  (Can be specialized to target if *)
(* desired.)                                                                  *)
(* ---------------------------------------------------------------------------*)
(*****************************************************************************)

{ This procedure serves as a single access point to softfloat_exception_flags.
  It also helps to reduce code size a bit because softfloat_exception_flags is
  a threadvar. }
procedure set_inexact_flag;
begin
    include(softfloat_exception_flags,float_flag_inexact);
end;

{*----------------------------------------------------------------------------
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
| and 7, and returns the properly rounded 32-bit integer corresponding to the
| input.  If `zSign' is 1, the input is negated before being converted to an
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer.  However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*}

function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
var
    roundingMode: TFPURoundingMode;
    roundNearestEven: boolean;
    roundIncrement, roundBits: int8;
    z: int32;
begin
    roundingMode := softfloat_rounding_mode;
    roundNearestEven := (roundingMode = float_round_nearest_even);
    roundIncrement := $40;
    if not roundNearestEven then
    begin
        if ( roundingMode = float_round_to_zero ) then
        begin
            roundIncrement := 0;
        end
        else begin
            roundIncrement := $7F;
            if ( zSign<>0 ) then
            begin
                if ( roundingMode = float_round_up ) then
                  roundIncrement := 0;
            end
            else begin
                if ( roundingMode = float_round_down ) then
                  roundIncrement := 0;
            end;
        end;
    end;
    roundBits := lo(absZ) and $7F;
    absZ := ( absZ + roundIncrement ) shr 7;
    absZ := absZ and not( bits64( ord( ( roundBits xor  $40 ) = 0 ) and ord(roundNearestEven) ));
    z := absZ;
    if ( zSign<>0 ) then
      z := - z;
    if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor  zSign ) ) )<>0 then
    begin
        float_raise( float_flag_invalid );
        if zSign<>0 then
          result:=sbits32($80000000)
        else
          result:=$7FFFFFFF;
        exit;
    end;
    if ( roundBits<>0 ) then
      set_inexact_flag;
    result:=z;
end;

{*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*}

function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
var
    roundingMode: TFPURoundingMode;
    roundNearestEven, increment: flag;
    z: int64;
label
    overflow;
begin
    roundingMode := softfloat_rounding_mode;
    roundNearestEven := ord( roundingMode = float_round_nearest_even );
    increment := ord( sbits64(absZ1) < 0 );
    if ( roundNearestEven=0 ) then
    begin
        if ( roundingMode = float_round_to_zero ) then
        begin
            increment := 0;
        end
        else begin
            if ( zSign<>0 ) then
            begin
                increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
            end
            else begin
                increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
            end;
        end;
    end;
    if ( increment<>0 ) then
    begin
        inc(absZ0);
        if ( absZ0 = 0 ) then
          goto overflow;
        absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
    end;
    z := absZ0;
    if ( zSign<>0 ) then
      z := - z;
    if ( (z<>0) and (( ord( z < 0 ) xor  zSign )<>0) ) then
    begin
 overflow:
        float_raise( float_flag_invalid );
        if zSign<>0 then
          result:=int64($8000000000000000)
        else
          result:=int64($7FFFFFFFFFFFFFFF);
        exit;
    end;
    if ( absZ1<>0 ) then
      set_inexact_flag;
    result:=z;
end;

{*
-------------------------------------------------------------------------------
Shifts `a' right by the number of bits given in `count'.  If any nonzero
bits are shifted off, they are ``jammed'' into the least significant bit of
the result by setting the least significant bit to 1.  The value of `count'
can be arbitrarily large; in particular, if `count' is greater than 32, the
result will be either 0 or 1, depending on whether `a' is zero or nonzero.
The result is stored in the location pointed to by `zPtr'.
-------------------------------------------------------------------------------
*}
Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
var
  z: Bits32;
Begin
    if ( count = 0 ) then
        z := a
   else
    if ( count < 32 ) then
    Begin
        z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
    End
   else
    Begin
        z := bits32( a <> 0 );
    End;
    zPtr := z;
End;

{*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
| number of bits given in `count'.  Any bits shifted off are lost.  The value
| of `count' can be arbitrarily large; in particular, if `count' is greater
| than 128, the result will be 0.  The result is broken into two 64-bit pieces
| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*}

procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
var
    z0, z1: bits64;
    negCount: int8;
begin
    negCount := ( - count ) and 63;

    if ( count = 0 ) then
    begin
        z1 := a1;
        z0 := a0;
    end
    else if ( count < 64 ) then
    begin
        z1 := ( a0 shl negCount ) or ( a1 shr count );
        z0 := a0 shr count;
    end
    else
    begin
         if ( count < 128 ) then
          z1 := a0 shr ( count and 63 )
        else
          z1 := 0;
        z0 := 0;
    end;
    z1Ptr := z1;
    z0Ptr := z0;
end;


{*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
| number of bits given in `count'.  If any nonzero bits are shifted off, they
| are ``jammed'' into the least significant bit of the result by setting the
| least significant bit to 1.  The value of `count' can be arbitrarily large;
| in particular, if `count' is greater than 128, the result will be either
| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
| nonzero.  The result is broken into two 64-bit pieces which are stored at
| the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*}

procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
var
    z0,z1 : bits64;
    negCount : int8;
begin
    negCount := ( - count ) and 63;

    if ( count = 0 ) then begin
        z1 := a1;
        z0 := a0;
    end
    else if ( count < 64 ) then begin
        z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
        z0 := a0 shr count;
    end
    else begin
        if ( count = 64 ) then begin
            z1 := a0 or ord( a1 <> 0 );
        end
        else if ( count < 128 ) then begin
            z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
        end
        else begin
            z1 := ord( ( a0 or a1 ) <> 0 );
        end;
        z0 := 0;
    end;
    z1Ptr := z1;
    z0Ptr := z0;
end;

{*
-------------------------------------------------------------------------------
Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
number of bits given in `count'.  Any bits shifted off are lost.  The value
of `count' can be arbitrarily large; in particular, if `count' is greater
than 64, the result will be 0.  The result is broken into two 32-bit pieces
which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 shift64Right(
     a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
Var
  z0, z1: bits32;
  negCount : int8;
Begin
    negCount := ( - count ) AND 31;

    if ( count = 0 ) then
    Begin
        z1 := a1;
        z0 := a0;
    End
    else if ( count < 32 ) then
    Begin
        z1 := ( a0 shl negCount ) OR ( a1 shr count );
        z0 := a0 shr count;
    End
   else
    Begin
        if (count < 64) then
          z1 := ( a0 shr ( count AND 31 ) )
        else
          z1 := 0;
        z0 := 0;
    End;
    z1Ptr := z1;
    z0Ptr := z0;
End;

{*
-------------------------------------------------------------------------------
Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
number of bits given in `count'.  If any nonzero bits are shifted off, they
are ``jammed'' into the least significant bit of the result by setting the
least significant bit to 1.  The value of `count' can be arbitrarily large;
in particular, if `count' is greater than 64, the result will be either 0
or 1, depending on whether the concatenation of `a0' and `a1' is zero or
nonzero.  The result is broken into two 32-bit pieces which are stored at
the locations pointed to by `z0Ptr' and `z1Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 shift64RightJamming(
     a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
VAR
    z0, z1 : bits32;
    negCount : int8;
Begin
    negCount := ( - count ) AND 31;

    if ( count = 0 ) then
    Begin
        z1 := a1;
        z0 := a0;
    End
   else
    if ( count < 32 ) then
    Begin
        z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
        z0 := a0 shr count;
    End
   else
    Begin
        if ( count = 32 ) then
        Begin
            z1 := a0 OR bits32( a1 <> 0 );
        End
       else
        if ( count < 64 ) Then
        Begin
            z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
        End
       else
        Begin
            z1 := bits32( ( a0 OR a1 ) <> 0 );
        End;
        z0 := 0;
    End;
    z1Ptr := z1;
    z0Ptr := z0;
End;


{*----------------------------------------------------------------------------
| Shifts `a' right by the number of bits given in `count'.  If any nonzero
| bits are shifted off, they are ``jammed'' into the least significant bit of
| the result by setting the least significant bit to 1.  The value of `count'
| can be arbitrarily large; in particular, if `count' is greater than 64, the
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
| The result is stored in the location pointed to by `zPtr'.
*----------------------------------------------------------------------------*}

procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
var
    z: bits64;
begin
    if ( count = 0 ) then
    begin
        z := a;
    end
    else if ( count < 64 ) then
    begin
        z := ( a shr count ) or ord( ( a  shl ( ( - count ) and 63 ) ) <> 0 );
    end
    else
    begin
        z := ord( a <> 0 );
    end;
    zPtr := z;
end;

{$if not defined(shift64ExtraRightJamming)}
procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
overload;
forward;
{$endif}

{*
-------------------------------------------------------------------------------
Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
by 32 _plus_ the number of bits given in `count'.  The shifted result is
at most 64 nonzero bits; these are broken into two 32-bit pieces which are
stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
off form a third 32-bit result as follows:  The _last_ bit shifted off is
the most-significant bit of the extra result, and the other 31 bits of the
extra result are all zero if and only if _all_but_the_last_ bits shifted off
were all zero.  This extra result is stored in the location pointed to by
`z2Ptr'.  The value of `count' can be arbitrarily large.
    (This routine makes more sense if `a0', `a1', and `a2' are considered
to form a fixed-point value with binary point between `a1' and `a2'.  This
fixed-point value is shifted right by the number of bits given in `count',
and the integer part of the result is returned at the locations pointed to
by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
corrupted as described above, and is returned at the location pointed to by
`z2Ptr'.)
-------------------------------------------------------------------------------
}
Procedure
 shift64ExtraRightJamming(
     a0: bits32;
     a1: bits32;
     a2: bits32;
     count: int16;
     VAR z0Ptr: bits32;
     VAR z1Ptr: bits32;
     VAR z2Ptr: bits32
 ); overload;
Var
    z0, z1, z2: bits32;
    negCount : int8;
Begin
    negCount := ( - count ) AND 31;

    if ( count = 0 ) then
    Begin
        z2 := a2;
        z1 := a1;
        z0 := a0;
    End
   else
    Begin
        if ( count < 32 ) Then
        Begin
            z2 := a1 shl negCount;
            z1 := ( a0 shl negCount ) OR ( a1 shr count );
            z0 := a0 shr count;
        End
       else
        Begin
            if ( count = 32 ) then
            Begin
                z2 := a1;
                z1 := a0;
            End
           else
            Begin
                a2 := a2 or a1;
                if ( count < 64 ) then
                Begin
                    z2 := a0 shl negCount;
                    z1 := a0 shr ( count AND 31 );
                End
               else
                Begin
                    if count = 64 then
                       z2 := a0
                    else
                       z2 := bits32(a0 <> 0);
                    z1 := 0;
                End;
            End;
            z0 := 0;
        End;
        z2 := z2 or bits32( a2 <> 0 );
    End;
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
End;

{*
-------------------------------------------------------------------------------
Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
number of bits given in `count'.  Any bits shifted off are lost.  The value
of `count' must be less than 32.  The result is broken into two 32-bit
pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 shortShift64Left(
     a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
Begin

    z1Ptr := a1 shl count;
    if count = 0 then
      z0Ptr := a0
    else
      z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
End;

{*
-------------------------------------------------------------------------------
Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
by the number of bits given in `count'.  Any bits shifted off are lost.
The value of `count' must be less than 32.  The result is broken into three
32-bit pieces which are stored at the locations pointed to by `z0Ptr',
`z1Ptr', and `z2Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 shortShift96Left(
     a0: bits32;
     a1: bits32;
     a2: bits32;
     count: int16;
     VAR z0Ptr: bits32;
     VAR z1Ptr: bits32;
     VAR z2Ptr: bits32
 );
Var
    z0, z1, z2: bits32;
    negCount: int8;
Begin
    z2 := a2 shl count;
    z1 := a1 shl count;
    z0 := a0 shl count;
    if ( 0 < count ) then
    Begin
        negCount := ( ( - count ) AND 31 );
        z1 := z1 or (a2 shr negCount);
        z0 := z0 or (a1 shr negCount);
    End;
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
End;

{*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
| number of bits given in `count'.  Any bits shifted off are lost.  The value
| of `count' must be less than 64.  The result is broken into two 64-bit
| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*}

procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
begin
    z1Ptr := a1 shl count;
    if count=0 then
      z0Ptr:=a0
    else
      z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
end;

{*
-------------------------------------------------------------------------------
Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
value formed by concatenating `b0' and `b1'.  Addition is modulo 2^64, so
any carry out is lost.  The result is broken into two 32-bit pieces which
are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 add64(
     a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
Var
    z1: bits32;
Begin
    z1 := a1 + b1;
    z1Ptr := z1;
    z0Ptr := a0 + b0 + bits32( z1 < a1 );
End;

{*
-------------------------------------------------------------------------------
Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
96-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
modulo 2^96, so any carry out is lost.  The result is broken into three
32-bit pieces which are stored at the locations pointed to by `z0Ptr',
`z1Ptr', and `z2Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 add96(
     a0: bits32;
     a1: bits32;
     a2: bits32;
     b0: bits32;
     b1: bits32;
     b2: bits32;
     VAR z0Ptr: bits32;
     VAR z1Ptr: bits32;
     VAR z2Ptr: bits32
 );
var
    z0, z1, z2: bits32;
    carry0, carry1: int8;
Begin
    z2 := a2 + b2;
    carry1 := int8( z2 < a2 );
    z1 := a1 + b1;
    carry0 := int8( z1 < a1 );
    z0 := a0 + b0;
    z1 := z1 + carry1;
    z0 := z0 + bits32( z1 < carry1 );
    z0 := z0 + carry0;
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
End;

{*----------------------------------------------------------------------------
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
| by the number of bits given in `count'.  Any bits shifted off are lost.
| The value of `count' must be less than 64.  The result is broken into three
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
| `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*}

procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
var
    z0, z1, z2 : bits64;
    negCount : int8;
begin
    z2 := a2 shl count;
    z1 := a1 shl count;
    z0 := a0 shl count;
    if ( 0 < count ) then
    begin
        negCount := ( ( - count ) and 63 );
        z1 := z1 or (a2 shr negCount);
        z0 := z0 or (a1 shr negCount);
    end;
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
end;

{*----------------------------------------------------------------------------
| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
| any carry out is lost.  The result is broken into two 64-bit pieces which
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*}

procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
var
    z1 : bits64;
begin
    z1 := a1 + b1;
    z1Ptr := z1;
    z0Ptr := a0 + b0 + ord( z1 < a1 );
end;

{*----------------------------------------------------------------------------
| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
| modulo 2^192, so any carry out is lost.  The result is broken into three
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
| `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*}

procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
var
    z0, z1, z2 : bits64;
    carry0, carry1 : int8;
begin
    z2 := a2 + b2;
    carry1 := ord( z2 < a2 );
    z1 := a1 + b1;
    carry0 := ord( z1 < a1 );
    z0 := a0 + b0;
    inc(z1, carry1);
    inc(z0, ord( z1 < carry1 ));
    inc(z0, carry0);
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
end;

{*
-------------------------------------------------------------------------------
Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
64-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
2^64, so any borrow out (carry out) is lost.  The result is broken into two
32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
`z1Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 sub64(
     a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
Begin
    z1Ptr := a1 - b1;
    z0Ptr := a0 - b0 - bits32( a1 < b1 );
End;

{*
-------------------------------------------------------------------------------
Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
the 96-bit value formed by concatenating `a0', `a1', and `a2'.  Subtraction
is modulo 2^96, so any borrow out (carry out) is lost.  The result is broken
into three 32-bit pieces which are stored at the locations pointed to by
`z0Ptr', `z1Ptr', and `z2Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 sub96(
     a0:bits32;
     a1:bits32;
     a2:bits32;
     b0:bits32;
     b1:bits32;
     b2:bits32;
     VAR z0Ptr:bits32;
     VAR z1Ptr:bits32;
     VAR z2Ptr:bits32
 );
Var
    z0, z1, z2: bits32;
    borrow0, borrow1: int8;
Begin
    z2 := a2 - b2;
    borrow1 := int8( a2 < b2 );
    z1 := a1 - b1;
    borrow0 := int8( a1 < b1 );
    z0 := a0 - b0;
    z0 := z0 - bits32( z1 < borrow1 );
    z1 := z1 - borrow1;
    z0 := z0 -borrow0;
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
End;

{*----------------------------------------------------------------------------
| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
| `z1Ptr'.
*----------------------------------------------------------------------------*}

procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
begin
    z1Ptr := a1 - b1;
    z0Ptr := a0 - b0 - ord( a1 < b1 );
end;


{*----------------------------------------------------------------------------
| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
| result is broken into three 64-bit pieces which are stored at the locations
| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*}

procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
var
    z0, z1, z2 : bits64;
    borrow0, borrow1 : int8;
begin
    z2 := a2 - b2;
    borrow1 := ord( a2 < b2 );
    z1 := a1 - b1;
    borrow0 := ord( a1 < b1 );
    z0 := a0 - b0;
    dec(z0, ord( z1 < borrow1 ));
    dec(z1, borrow1);
    dec(z0, borrow0);
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
end;

{*
-------------------------------------------------------------------------------
Multiplies `a' by `b' to obtain a 64-bit product.  The product is broken
into two 32-bit pieces which are stored at the locations pointed to by
`z0Ptr' and `z1Ptr'.
-------------------------------------------------------------------------------
*}
{$IFDEF SOFTFPU_COMPILER_MUL32TO64}
Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
var
  tmp: qword;
begin
  tmp:=qword(a) * b;
  z0ptr:=hi(tmp);
  z1ptr:=lo(tmp);
end;
{$ELSE}
Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
:bits32 );
Var
    aHigh, aLow, bHigh, bLow: bits16;
    z0, zMiddleA, zMiddleB, z1: bits32;
Begin
    aLow := bits16(a);
    aHigh := a shr 16;
    bLow := bits16(b);
    bHigh := b shr 16;
    z1 := ( bits32( aLow) ) * bLow;
    zMiddleA := ( bits32 (aLow) ) * bHigh;
    zMiddleB := ( bits32 (aHigh) ) * bLow;
    z0 := ( bits32 (aHigh) ) * bHigh;
    zMiddleA := zMiddleA + zMiddleB;
    z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
    zMiddleA := zmiddleA shl 16;
    z1 := z1 + zMiddleA;
    z0 := z0 + bits32( z1 < zMiddleA );
    z1Ptr := z1;
    z0Ptr := z0;
End;
{$ENDIF}

{*
-------------------------------------------------------------------------------
Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
to obtain a 96-bit product.  The product is broken into three 32-bit pieces
which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
`z2Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 mul64By32To96(
     a0:bits32;
     a1:bits32;
     b:bits32;
     VAR z0Ptr:bits32;
     VAR z1Ptr:bits32;
     VAR z2Ptr:bits32
 );
Var
    z0, z1, z2, more1: bits32;
Begin
    mul32To64( a1, b, z1, z2 );
    mul32To64( a0, b, z0, more1 );
    add64( z0, more1, 0, z1, z0, z1 );
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
End;

{*
-------------------------------------------------------------------------------
Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
product.  The product is broken into four 32-bit pieces which are stored at
the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
-------------------------------------------------------------------------------
*}
Procedure
 mul64To128(
     a0:bits32;
     a1:bits32;
     b0:bits32;
     b1:bits32;
     VAR z0Ptr:bits32;
     VAR z1Ptr:bits32;
     VAR z2Ptr:bits32;
     VAR z3Ptr:bits32
 );
Var
    z0, z1, z2, z3: bits32;
    more1, more2: bits32;
Begin

    mul32To64( a1, b1, z2, z3 );
    mul32To64( a1, b0, z1, more2 );
    add64( z1, more2, 0, z2, z1, z2 );
    mul32To64( a0, b0, z0, more1 );
    add64( z0, more1, 0, z1, z0, z1 );
    mul32To64( a0, b1, more1, more2 );
    add64( more1, more2, 0, z2, more1, z2 );
    add64( z0, z1, 0, more1, z0, z1 );
    z3Ptr := z3;
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
End;

{*----------------------------------------------------------------------------
| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
| into two 64-bit pieces which are stored at the locations pointed to by
| `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*}

procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
var
    aHigh, aLow, bHigh, bLow : bits32;
    z0, zMiddleA, zMiddleB, z1 : bits64;
begin
    aLow := a;
    aHigh := a shr 32;
    bLow := b;
    bHigh := b shr 32;
    z1 := ( bits64(aLow) ) * bLow;
    zMiddleA := ( bits64( aLow )) * bHigh;
    zMiddleB := ( bits64( aHigh )) * bLow;
    z0 := ( bits64(aHigh) ) * bHigh;
    inc(zMiddleA, zMiddleB);
    inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
    zMiddleA := zMiddleA shl 32;
    inc(z1, zMiddleA);
    inc(z0, ord( z1 < zMiddleA ));
    z1Ptr := z1;
    z0Ptr := z0;
end;

{*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
| product.  The product is broken into four 64-bit pieces which are stored at
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
*----------------------------------------------------------------------------*}

procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
var
    z0,z1,z2,z3,more1,more2 : bits64;
begin
    mul64To128( a1, b1, z2, z3 );
    mul64To128( a1, b0, z1, more2 );
    add128( z1, more2, 0, z2, z1, z2 );
    mul64To128( a0, b0, z0, more1 );
    add128( z0, more1, 0, z1, z0, z1 );
    mul64To128( a0, b1, more1, more2 );
    add128( more1, more2, 0, z2, more1, z2 );
    add128( z0, z1, 0, more1, z0, z1 );
    z3Ptr := z3;
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
end;


{*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
| `z2Ptr'.
*----------------------------------------------------------------------------*}
procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
var
    z0, z1, z2, more1 : bits64;
begin
    mul64To128( a1, b, z1, z2 );
    mul64To128( a0, b, z0, more1 );
    add128( z0, more1, 0, z1, z0, z1 );
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;
end;

{*----------------------------------------------------------------------------
| Returns an approximation to the 64-bit integer quotient obtained by dividing
| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
| toward zero, the approximation returned lies between q and q + 2 inclusive.
| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
| unsigned integer is returned.
*----------------------------------------------------------------------------*}

Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
var
    b0, b1, rem0, rem1, term0, term1, z : bits64;
begin
    if ( b <= a0 ) then
      begin
        result:=qword( $FFFFFFFFFFFFFFFF );
        exit;
      end;
    b0 := b shr 32;
    if ( b0 shl 32 <= a0 ) then
      z:=qword( $FFFFFFFF00000000 )
    else
      z:=( a0 div b0 ) shl 32;
    mul64To128( b, z, term0, term1 );
    sub128( a0, a1, term0, term1, rem0, rem1 );
    while ( ( sbits64(rem0) ) < 0 ) do begin
        dec(z,qword( $100000000 ));
        b1 := b shl 32;
        add128( rem0, rem1, b0, b1, rem0, rem1 );
    end;
    rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
    if ( b0 shl 32 <= rem0 ) then
      z:=z or $FFFFFFFF
    else
      z:=z or rem0 div b0;
    result:=z;
end;


{*
-------------------------------------------------------------------------------
Returns an approximation to the 32-bit integer quotient obtained by dividing
`b' into the 64-bit value formed by concatenating `a0' and `a1'.  The
divisor `b' must be at least 2^31.  If q is the exact quotient truncated
toward zero, the approximation returned lies between q and q + 2 inclusive.
If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
unsigned integer is returned.
-------------------------------------------------------------------------------
*}
Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
Var
    b0, b1: bits32;
    rem0, rem1, term0, term1: bits32;
    z: bits32;
Begin
    if ( b <= a0 ) then
    Begin
       estimateDiv64To32 := $FFFFFFFF;
       exit;
    End;
    b0 := b shr 16;
    if ( b0 shl 16 <= a0 ) then
       z:= $FFFF0000
     else
       z:= ( a0 div b0 ) shl 16;
    mul32To64( b, z, term0, term1 );
    sub64( a0, a1, term0, term1, rem0, rem1 );
    while ( ( sbits32 (rem0) ) < 0 ) do
    Begin
        z := z - $10000;
        b1 := b shl 16;
        add64( rem0, rem1, b0, b1, rem0, rem1 );
    End;
    rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
    if ( b0 shl 16 <= rem0 ) then
      z := z or $FFFF
    else
      z := z or (rem0 div b0);
    estimateDiv64To32 := z;

End;

{*
-------------------------------------------------------------------------------
Returns an approximation to the square root of the 32-bit significand given
by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
`aExp' (the least significant bit) is 1, the integer returned approximates
2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
case, the approximation returned lies strictly within +/-2 of the exact
value.
-------------------------------------------------------------------------------
*}
Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
    const sqrtOddAdjustments: array[0..15] of bits16 = (
        $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
        $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
    );
    const sqrtEvenAdjustments: array[0..15] of bits16 = (
        $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
        $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
    );
Var
    index: int8;
    z: bits32;
Begin

    index := ( a shr 27 ) AND 15;
    if ( aExp AND 1 ) <> 0  then
    Begin
        z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
        z := ( ( a div z ) shl 14 ) + ( z shl 15 );
        a := a shr 1;
    End
    else
    Begin
        z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
        z := a div z + z;
        if ( $20000 <= z ) then
          z := $FFFF8000
        else
          z := ( z shl 15 );
        if ( z <= a ) then
        Begin
           estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
           exit;
        End;
    End;
    estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
End;

{*
-------------------------------------------------------------------------------
Returns the number of leading 0 bits before the most-significant 1 bit of
`a'.  If `a' is zero, 32 is returned.
-------------------------------------------------------------------------------
*}
Function countLeadingZeros32( a:bits32 ): int8;

    const countLeadingZerosHigh:array[0..255] of int8 = (
        8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    );
Var
    shiftCount: int8;
Begin

    shiftCount := 0;
    if ( a < $10000 ) then
    Begin
        shiftCount := shiftcount + 16;
        a := a shl 16;
    End;
    if ( a < $1000000 ) then
    Begin
        shiftCount := shiftcount + 8;
        a := a shl 8;
    end;
    shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
    countLeadingZeros32:= shiftCount;
End;

{*----------------------------------------------------------------------------
| Returns the number of leading 0 bits before the most-significant 1 bit of
| `a'.  If `a' is zero, 64 is returned.
*----------------------------------------------------------------------------*}

function countLeadingZeros64( a : bits64): int8;
var
 shiftcount : int8;
Begin
    shiftCount := 0;
    if ( a <  bits64(bits64(1)  shl 32 )) then
        shiftCount := shiftcount + 32
    else
        a := a shr 32;
    shiftCount := shiftCount + countLeadingZeros32( a );
    countLeadingZeros64:= shiftCount;
End;


{*
-------------------------------------------------------------------------------
Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
Otherwise, returns 0.
-------------------------------------------------------------------------------
*}
Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
Begin

    le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );

End;

{*
-------------------------------------------------------------------------------
Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
than the 64-bit value formed by concatenating `b0' and `b1'.  Otherwise,
returns 0.
-------------------------------------------------------------------------------
*}
Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
Begin
    lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
End;

const
  float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);


(*****************************************************************************)
(*                      End Low-Level arithmetic                             *)
(*****************************************************************************)

{*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
| returns 0.
*----------------------------------------------------------------------------*}

function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
begin
    result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
end;


{*
-------------------------------------------------------------------------------
Functions and definitions to determine:  (1) whether tininess for underflow
is detected before or after rounding by default, (2) what (if anything)
happens when exceptions are raised, (3) how signaling NaNs are distinguished
from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
are propagated from function inputs to output.  These details are ENDIAN
specific
-------------------------------------------------------------------------------
*}
{$IFDEF ENDIAN_LITTLE}
{*
-------------------------------------------------------------------------------
Internal canonical NaN format.
-------------------------------------------------------------------------------
*}
TYPE
 commonNaNT = record
   high, low : bits32;
   sign: flag;
 end;
{*
-------------------------------------------------------------------------------
The pattern for a default generated single-precision NaN.
-------------------------------------------------------------------------------
*}
const float32_default_nan = $FFC00000;

{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is a NaN;
otherwise returns 0.
-------------------------------------------------------------------------------
*}
Function float32_is_nan( a : float32 ): flag;
Begin

    float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );

End;

{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is a signaling
NaN; otherwise returns 0.
-------------------------------------------------------------------------------
*}
Function float32_is_signaling_nan( a : float32  ): flag;
Begin

    float32_is_signaling_nan := flag
      (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));

End;

{*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point NaN
`a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
exception is raised.
-------------------------------------------------------------------------------
*}
function float32ToCommonNaN(a: float32) : commonNaNT;
var
    z : commonNaNT ;
Begin
    if ( float32_is_signaling_nan( a ) <> 0) then
       float_raise( float_flag_invalid );
    z.sign := a shr 31;
    z.low := 0;
    z.high := a shl 9;
    result := z;
End;

{*
-------------------------------------------------------------------------------
Returns the result of converting the canonical NaN `a' to the single-
precision floating-point format.
-------------------------------------------------------------------------------
*}
Function commonNaNToFloat32( a : commonNaNT ): float32;
Begin
    commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
End;

{*
-------------------------------------------------------------------------------
Takes two single-precision floating-point values `a' and `b', one of which
is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
signaling NaN, the invalid exception is raised.
-------------------------------------------------------------------------------
*}
Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
Var
    aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
label returnLargerSignificand;
Begin
    aIsNaN := float32_is_nan( a );
    aIsSignalingNaN := float32_is_signaling_nan( a );
    bIsNaN := float32_is_nan( b );
    bIsSignalingNaN := float32_is_signaling_nan( b );
    a := a or $00400000;
    b := b or $00400000;
    if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
        float_raise( float_flag_invalid );
    if ( aIsSignalingNaN )<> 0  then
    Begin
        if ( bIsSignalingNaN ) <>  0 then
          goto returnLargerSignificand;
        if bIsNan <> 0 then
          propagateFloat32NaN := b
        else
          propagateFloat32NaN := a;
        exit;
    End
    else if ( aIsNaN <> 0) then
    Begin
        if ( bIsSignalingNaN or not bIsNaN )<> 0 then
        Begin
           propagateFloat32NaN := a;
           exit;
        End;
 returnLargerSignificand:
        if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
        Begin
           propagateFloat32NaN := b;
           exit;
        End;
        if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
        Begin
           propagateFloat32NaN :=  a;
        End;
        if a < b then
          propagateFloat32NaN := a
        else
          propagateFloat32NaN := b;
        exit;
    End
    else
    Begin
        propagateFloat32NaN := b;
        exit;
    End;

End;

{*
-------------------------------------------------------------------------------
The pattern for a default generated double-precision NaN.  The `high' and
`low' values hold the most- and least-significant bits, respectively.
-------------------------------------------------------------------------------
*}
const
    float64_default_nan_high = $FFF80000;
    float64_default_nan_low  = $00000000;

{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is a NaN;
otherwise returns 0.
-------------------------------------------------------------------------------
*}
Function float64_is_nan( a : float64 ) : flag;
Begin

    float64_is_nan :=
           flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
        and (( a.low or ( a.high and $000FFFFF ) )<>0));

End;

{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is a signaling
NaN; otherwise returns 0.
-------------------------------------------------------------------------------
*}
Function float64_is_signaling_nan( a : float64 ): flag;
Begin

    float64_is_signaling_nan :=
           flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
        and ( a.low or ( a.high and $0007FFFF ) );
End;

{*
-------------------------------------------------------------------------------
Returns the result of converting the double-precision floating-point NaN
`a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
exception is raised.
-------------------------------------------------------------------------------
*}
function float64ToCommonNaN( a : float64 ) : commonNaNT;
Var
    z : commonNaNT;
Begin
    if ( float64_is_signaling_nan( a )<>0 ) then
        float_raise( float_flag_invalid );
    z.sign := a.high shr 31;
    shortShift64Left( a.high, a.low, 12, z.high, z.low );
    result := z;

End;
{*
-------------------------------------------------------------------------------
Returns the result of converting the canonical NaN `a' to the double-
precision floating-point format.
-------------------------------------------------------------------------------
*}
function commonNaNToFloat64( a : commonNaNT) : float64;
Var
    z: float64;
Begin
    shift64Right( a.high, a.low, 12, z.high, z.low );
    z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
    result := z;
End;

{*
-------------------------------------------------------------------------------
Takes two double-precision floating-point values `a' and `b', one of which
is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
signaling NaN, the invalid exception is raised.
-------------------------------------------------------------------------------
*}
Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
Var
    aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
    label returnLargerSignificand;
Begin
    aIsNaN := float64_is_nan( a );
    aIsSignalingNaN := float64_is_signaling_nan( a );
    bIsNaN := float64_is_nan( b );
    bIsSignalingNaN := float64_is_signaling_nan( b );
    a.high := a.high or $00080000;
    b.high := b.high or $00080000;
    if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
        float_raise( float_flag_invalid );
    if ( aIsSignalingNaN )<>0 then
    Begin
        if ( bIsSignalingNaN )<>0 then
            goto returnLargerSignificand;
        if bIsNan <> 0 then
           c := b
        else
           c := a;
        exit;
    End
    else if ( aIsNaN )<> 0 then
    Begin
        if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
        Begin
          c := a;
           exit;
        End;
 returnLargerSignificand:
        if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
        Begin
           c := b;
           exit;
        End;
        if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
        Begin
           c := a;
           exit;
        End;
        if a.high < b.high then
         c := a
        else
         c := b;
        exit;
    End
    else
    Begin
        c := b;
        exit;
    End;
End;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
| otherwise returns 0.
*----------------------------------------------------------------------------*}

function float128_is_nan( a : float128): flag;
begin
    result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
        and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is a
| signaling NaN; otherwise returns 0.
*----------------------------------------------------------------------------*}

function float128_is_signaling_nan( a : float128): flag;
begin
    result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
        ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point NaN
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
| exception is raised.
*----------------------------------------------------------------------------*}

function float128ToCommonNaN( a : float128): commonNaNT;
var
    z: commonNaNT;
    qhigh,qlow : qword;
begin
    if ( float128_is_signaling_nan( a )<>0)  then
      float_raise( float_flag_invalid );
    z.sign := a.high shr 63;
    shortShift128Left( a.high, a.low, 16, qhigh, qlow );
    z.high:=qhigh shr 32;
    z.low:=qhigh and $ffffffff;
    result:=z;
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the quadruple-
| precision floating-point format.
*----------------------------------------------------------------------------*}

function commonNaNToFloat128( a : commonNaNT): float128;
var
    z: float128;
begin
    shift128Right( a.high, a.low, 16, z.high, z.low );
    z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
    result:=z;
end;

{*----------------------------------------------------------------------------
| Takes two quadruple-precision floating-point values `a' and `b', one of
| which is a NaN, and returns the appropriate NaN result.  If either `a' or
| `b' is a signaling NaN, the invalid exception is raised.
*----------------------------------------------------------------------------*}

function propagateFloat128NaN( a: float128; b : float128): float128;
var
    aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
label
    returnLargerSignificand;
begin
    aIsNaN := float128_is_nan( a );
    aIsSignalingNaN := float128_is_signaling_nan( a );
    bIsNaN := float128_is_nan( b );
    bIsSignalingNaN := float128_is_signaling_nan( b );
    a.high := a.high or int64( $0000800000000000 );
    b.high := b.high or int64( $0000800000000000 );
    if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
       float_raise( float_flag_invalid );
    if ( aIsSignalingNaN )<>0 then
    begin
        if ( bIsSignalingNaN )<>0 then
          goto returnLargerSignificand;
        if bIsNaN<>0 then
          result := b
        else
          result := a;
        exit;
    end
    else if ( aIsNaN )<>0 then
    begin
        if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
          begin
          	result := a;
          	exit;
          end;
 returnLargerSignificand:
        if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
          begin
          	result := b;
          	exit;
          end;
        if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
          begin
          	result := a;
          	exit
          end;
        if ( a.high < b.high ) then
          result := a
        else
          result := b;
        exit;
    end
    else
    result:=b;
end;


{$ELSE}

{ Big endian code }
(*----------------------------------------------------------------------------
| Internal canonical NaN format.
*----------------------------------------------------------------------------*)
type
 commonNANT = record
  high, low : bits32;
  sign : flag;
 end;

(*----------------------------------------------------------------------------
| The pattern for a default generated single-precision NaN.
*----------------------------------------------------------------------------*)
const float32_default_nan = $7FFFFFFF;

(*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is a NaN;
| otherwise returns 0.
*----------------------------------------------------------------------------*)
function float32_is_nan(a:  float32): flag;
begin
    float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
end;

(*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is a signaling
| NaN; otherwise returns 0.
*----------------------------------------------------------------------------*)
function float32_is_signaling_nan(a: float32):flag;
 begin
   float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
 end;

(*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point NaN
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
| exception is raised.
*----------------------------------------------------------------------------*)
function float32ToCommonNaN( a: float32) : commonNaNT;
 var
  z: commonNANT;
 begin
   if float32_is_signaling_nan(a)<>0 then
      float_raise(float_flag_invalid);
   z.sign := a shr 31;
   z.low := 0;
   z.high := a shl 9;
   result:=z;
 end;

(*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the single-
| precision floating-point format.
*----------------------------------------------------------------------------*)
function CommonNanToFloat32(a : CommonNaNT): float32;
 begin
    CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
 end;

(*----------------------------------------------------------------------------
| Takes two single-precision floating-point values `a' and `b', one of which
| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
| signaling NaN, the invalid exception is raised.
*----------------------------------------------------------------------------*)
function  propagateFloat32NaN( a: float32 ; b: float32): float32;
 var
  aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
 begin
    aIsNaN := float32_is_nan( a );
    aIsSignalingNaN := float32_is_signaling_nan( a );
    bIsNaN := float32_is_nan( b );
    bIsSignalingNaN := float32_is_signaling_nan( b );
    a := a or $00400000;
    b := b or $00400000;
    if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
       float_raise( float_flag_invalid );
    if bIsSignalingNaN<>0 then
        propagateFloat32Nan := b
    else if aIsSignalingNan<>0 then
        propagateFloat32Nan := a
    else if bIsNan<>0 then
        propagateFloat32Nan := b
    else
        propagateFloat32Nan := a;
 end;


(*----------------------------------------------------------------------------
| The pattern for a default generated double-precision NaN.  The `high' and
| `low' values hold the most- and least-significant bits, respectively.
*----------------------------------------------------------------------------*)
const
    float64_default_nan_high = $7FFFFFFF;
    float64_default_nan_low  = $FFFFFFFF;

(*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is a NaN;
| otherwise returns 0.
*----------------------------------------------------------------------------*)

function float64_is_nan(a: float64): flag;
 begin
    float64_is_nan := flag (
           ( $FFE00000 <= bits32 ( a.high shl 1 ) )
        and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
 end;

(*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is a signaling
| NaN; otherwise returns 0.
*----------------------------------------------------------------------------*)
function float64_is_signaling_nan( a:float64): flag;
 begin
    float64_is_signaling_nan := flag(
           ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
        and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));

 end;

(*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point NaN
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
| exception is raised.
*----------------------------------------------------------------------------*)
function float64ToCommonNaN( a : float64)  : commonNaNT;
 var
   z : commonNaNT;
 begin
    if ( float64_is_signaling_nan( a )<>0 ) then
        float_raise( float_flag_invalid );
    z.sign := a.high shr 31;
    shortShift64Left( a.high, a.low, 12, z.high, z.low );
    result:=z;
 end;

(*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the double-
| precision floating-point format.
*----------------------------------------------------------------------------*)
function commonNaNToFloat64( a : commonNaNT): float64;
 var
  z: float64;
 begin
    shift64Right( a.high, a.low, 12, z.high, z.low );
    z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
    result:=z;
 end;

(*----------------------------------------------------------------------------
| Takes two double-precision floating-point values `a' and `b', one of which
| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
| signaling NaN, the invalid exception is raised.
*----------------------------------------------------------------------------*)
Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
var
 aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
 begin
    aIsNaN := float64_is_nan( a );
    aIsSignalingNaN := float64_is_signaling_nan( a );
    bIsNaN := float64_is_nan( b );
    bIsSignalingNaN := float64_is_signaling_nan( b );
    a.high := a.high or $00080000;
    b.high := b.high or $00080000;
    if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
       float_raise( float_flag_invalid );
    if bIsSignalingNaN<>0 then
        c := b
    else if aIsSignalingNan<>0 then
        c := a
    else if bIsNan<>0 then
        c := b
    else
        c := a;
 end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
| otherwise returns 0.
*----------------------------------------------------------------------------*}

function float128_is_nan( a : float128): flag;
begin
    result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
        and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is a
| signaling NaN; otherwise returns 0.
*----------------------------------------------------------------------------*}

function float128_is_signaling_nan( a : float128): flag;
begin
    result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
        ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point NaN
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
| exception is raised.
*----------------------------------------------------------------------------*}

function float128ToCommonNaN( a : float128): commonNaNT;
var
    z: commonNaNT;
    qhigh,qlow : qword;
begin
    if ( float128_is_signaling_nan( a )<>0)  then
      float_raise( float_flag_invalid );
    z.sign := a.high shr 63;
    shortShift128Left( a.high, a.low, 16, qhigh, qlow );
    z.high:=qhigh shr 32;
    z.low:=qhigh and $ffffffff;
    result:=z;
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the quadruple-
| precision floating-point format.
*----------------------------------------------------------------------------*}

function commonNaNToFloat128( a : commonNaNT): float128;
var
    z: float128;
begin
    shift128Right( a.high, a.low, 16, z.high, z.low );
    z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
    result:=z;
end;

{*----------------------------------------------------------------------------
| Takes two quadruple-precision floating-point values `a' and `b', one of
| which is a NaN, and returns the appropriate NaN result.  If either `a' or
| `b' is a signaling NaN, the invalid exception is raised.
*----------------------------------------------------------------------------*}

function propagateFloat128NaN( a: float128; b : float128): float128;
var
    aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
label
    returnLargerSignificand;
begin
    aIsNaN := float128_is_nan( a );
    aIsSignalingNaN := float128_is_signaling_nan( a );
    bIsNaN := float128_is_nan( b );
    bIsSignalingNaN := float128_is_signaling_nan( b );
    a.high := a.high or int64( $0000800000000000 );
    b.high := b.high or int64( $0000800000000000 );
    if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
       float_raise( float_flag_invalid );
    if ( aIsSignalingNaN )<>0 then
    begin
        if ( bIsSignalingNaN )<>0 then
          goto returnLargerSignificand;
        if bIsNaN<>0 then
          result := b
        else
          result := a;
        exit;
    end
    else if ( aIsNaN )<>0 then
    begin
        if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
          begin
          	result := a;
          	exit;
          end;
 returnLargerSignificand:
        if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
          begin
          	result := b;
          	exit;
          end;
        if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
          begin
          	result := a;
          	exit
          end;
        if ( a.high < b.high ) then
          result := a
        else
          result := b;
        exit;
    end
    else
    result:=b;
end;

{$ENDIF}

(****************************************************************************)
(*                        END ENDIAN SPECIFIC CODE                          *)
(****************************************************************************)


{*
-------------------------------------------------------------------------------
Returns the fraction bits of the single-precision floating-point value `a'.
-------------------------------------------------------------------------------
*}
Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
 Begin
    ExtractFloat32Frac := A AND $007FFFFF;
 End;

{*
-------------------------------------------------------------------------------
Returns the exponent bits of the single-precision floating-point value `a'.
-------------------------------------------------------------------------------
*}
Function extractFloat32Exp( a: float32 ): Int16; inline;
  Begin
    extractFloat32Exp := (a shr 23) AND $FF;
  End;

{*
-------------------------------------------------------------------------------
Returns the sign bit of the single-precision floating-point value `a'.
-------------------------------------------------------------------------------
*}
Function extractFloat32Sign( a: float32 ): Flag; inline;
  Begin
    extractFloat32Sign := a shr 31;
  End;

{*
-------------------------------------------------------------------------------
Normalizes the subnormal single-precision floating-point value represented
by the denormalized significand `aSig'.  The normalized exponent and
significand are stored at the locations pointed to by `zExpPtr' and
`zSigPtr', respectively.
-------------------------------------------------------------------------------
*}
Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
 Var
   ShiftCount : BYTE;
 Begin

    shiftCount := countLeadingZeros32( aSig ) - 8;
    zSigPtr := aSig shl shiftCount;
    zExpPtr := 1 - shiftCount;
  End;

{*
-------------------------------------------------------------------------------
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
single-precision floating-point value, returning the result.  After being
shifted into the proper positions, the three fields are simply added
together to form the result.  This means that any integer portion of `zSig'
will be added into the exponent.  Since a properly normalized significand
will have an integer portion equal to 1, the `zExp' input should be 1 less
than the desired result exponent whenever `zSig' is a complete, normalized
significand.
-------------------------------------------------------------------------------
*}
Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
 Begin

    packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
      + zSig;
 End;

{*
-------------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
and significand `zSig', and returns the proper single-precision floating-
point value corresponding to the abstract input.  Ordinarily, the abstract
value is simply rounded and packed into the single-precision format, with
the inexact exception raised if the abstract input cannot be represented
exactly.  However, if the abstract value is too large, the overflow and
inexact exceptions are raised and an infinity or maximal finite value is
returned.  If the abstract value is too small, the input value is rounded to
a subnormal number, and the underflow and inexact exceptions are raised if
the abstract input cannot be represented exactly as a subnormal single-
precision floating-point number.
    The input significand `zSig' has its binary point between bits 30
and 29, which is 7 bits to the left of the usual location.  This shifted
significand must be normalized or smaller.  If `zSig' is not normalized,
`zExp' must be 0; in that case, the result returned is a subnormal number,
and it must not require rounding.  In the usual case that `zSig' is
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
The handling of underflow and overflow follows the IEC/IEEE Standard for
Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
 Var
   roundingMode : TFPURoundingMode;
   roundNearestEven : boolean;
   roundIncrement, roundBits : BYTE;
   IsTiny : boolean;
 Begin
    roundingMode := softfloat_rounding_mode;
    roundNearestEven := (roundingMode = float_round_nearest_even);
    roundIncrement := $40;
    if not roundNearestEven then
      Begin
        if ( roundingMode = float_round_to_zero ) Then
          Begin
            roundIncrement := 0;
          End
        else
          Begin
            roundIncrement := $7F;
            if ( zSign <> 0 ) then
              Begin
                if roundingMode = float_round_up then roundIncrement := 0;
              End
            else
              Begin
                if roundingMode = float_round_down then roundIncrement := 0;
              End;
         End
      End;
    roundBits := zSig AND $7F;
    if ($FD <= bits16 (zExp) ) then
     Begin
        if (( $FD < zExp ) OR  ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
          Begin
             float_raise( [float_flag_overflow,float_flag_inexact] );
             roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
             exit;
          End;
        if ( zExp < 0 ) then
          Begin
            isTiny :=
                   ( softfloat_detect_tininess = float_tininess_before_rounding )
                OR ( zExp < -1 )
                OR ( (zSig + roundIncrement) < $80000000 );
            shift32RightJamming( zSig, - zExp, zSig );
            zExp := 0;
            roundBits := zSig AND $7F;
            if ( isTiny and (roundBits<>0) ) then
               float_raise( float_flag_underflow );
          End;
    End;
    if ( roundBits )<> 0 then
       set_inexact_flag;
    zSig := ( zSig + roundIncrement ) shr 7;
    zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
    if ( zSig = 0 ) then zExp := 0;
    roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  End;

{*
-------------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
and significand `zSig', and returns the proper single-precision floating-
point value corresponding to the abstract input.  This routine is just like
`roundAndPackFloat32' except that `zSig' does not have to be normalized.
Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
floating-point exponent.
-------------------------------------------------------------------------------
*}
Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  Var
    ShiftCount : int8;
  Begin
    shiftCount := countLeadingZeros32( zSig ) - 1;
    normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  End;

{*
-------------------------------------------------------------------------------
Returns the most-significant 20 fraction bits of the double-precision
floating-point value `a'.
-------------------------------------------------------------------------------
*}
Function extractFloat64Frac0(a: float64): bits32; inline;
  Begin
    extractFloat64Frac0 := a.high and $000FFFFF;
  End;

{*
-------------------------------------------------------------------------------
Returns the least-significant 32 fraction bits of the double-precision
floating-point value `a'.
-------------------------------------------------------------------------------
*}
Function extractFloat64Frac1(a: float64): bits32; inline;
  Begin
    extractFloat64Frac1 := a.low;
  End;


{$define FPC_SYSTEM_HAS_extractFloat64Frac}
Function extractFloat64Frac(a: float64): bits64; inline;
  Begin
    extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  End;

{*
-------------------------------------------------------------------------------
Returns the exponent bits of the double-precision floating-point value `a'.
-------------------------------------------------------------------------------
*}
Function extractFloat64Exp(a: float64): int16; inline;
 Begin
    extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
 End;

{*
-------------------------------------------------------------------------------
Returns the sign bit of the double-precision floating-point value `a'.
-------------------------------------------------------------------------------
*}
Function extractFloat64Sign(a: float64) : flag; inline;
 Begin
    extractFloat64Sign := a.high shr 31;
 End;

{*
-------------------------------------------------------------------------------
Normalizes the subnormal double-precision floating-point value represented
by the denormalized significand formed by the concatenation of `aSig0' and
`aSig1'.  The normalized exponent is stored at the location pointed to by
`zExpPtr'.  The most significant 21 bits of the normalized significand are
stored at the location pointed to by `zSig0Ptr', and the least significant
32 bits of the normalized significand are stored at the location pointed to
by `zSig1Ptr'.
-------------------------------------------------------------------------------
*}
Procedure normalizeFloat64Subnormal(
     aSig0: bits32;
     aSig1: bits32;
     VAR zExpPtr : Int16;
     VAR zSig0Ptr : Bits32;
     VAR zSig1Ptr : Bits32
 );
 Var
  ShiftCount : Int8;
 Begin
    if ( aSig0 = 0 ) then
      Begin
        shiftCount := countLeadingZeros32( aSig1 ) - 11;
        if ( shiftCount < 0 ) then
          Begin
            zSig0Ptr := aSig1 shr ( - shiftCount );
            zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
          End
        else
           Begin
            zSig0Ptr := aSig1 shl shiftCount;
            zSig1Ptr := 0;
           End;
        zExpPtr := - shiftCount - 31;
      End
    else
      Begin
        shiftCount := countLeadingZeros32( aSig0 ) - 11;
        shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        zExpPtr := 1 - shiftCount;
      End;
  End;

procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
var
  shiftCount : int8;
begin
    shiftCount := countLeadingZeros64( aSig ) - 11;
    zSigPtr := aSig shl shiftCount;
    zExpPtr := 1 - shiftCount;
end;


{*
-------------------------------------------------------------------------------
Packs the sign `zSign', the exponent `zExp', and the significand formed by
the concatenation of `zSig0' and `zSig1' into a double-precision floating-
point value, returning the result.  After being shifted into the proper
positions, the three fields `zSign', `zExp', and `zSig0' are simply added
together to form the most significant 32 bits of the result.  This means
that any integer portion of `zSig0' will be added into the exponent.  Since
a properly normalized significand will have an integer portion equal to 1,
the `zExp' input should be 1 less than the desired result exponent whenever
`zSig0' and `zSig1' concatenated form a complete, normalized significand.
-------------------------------------------------------------------------------
*}
Procedure
 packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
 var
    z: Float64;
 Begin

    z.low := zSig1;
    z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
    c := z;
 End;


{*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| double-precision floating-point value, returning the result.  After being
| shifted into the proper positions, the three fields are simply added
| together to form the result.  This means that any integer portion of `zSig'
| will be added into the exponent.  Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*}

function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
begin
    result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
end;

{*
-------------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
and extended significand formed by the concatenation of `zSig0', `zSig1',
and `zSig2', and returns the proper double-precision floating-point value
corresponding to the abstract input.  Ordinarily, the abstract value is
simply rounded and packed into the double-precision format, with the inexact
exception raised if the abstract input cannot be represented exactly.
However, if the abstract value is too large, the overflow and inexact
exceptions are raised and an infinity or maximal finite value is returned.
If the abstract value is too small, the input value is rounded to a
subnormal number, and the underflow and inexact exceptions are raised if the
abstract input cannot be represented exactly as a subnormal double-precision
floating-point number.
    The input significand must be normalized or smaller.  If the input
significand is not normalized, `zExp' must be 0; in that case, the result
returned is a subnormal number, and it must not require rounding.  In the
usual case that the input significand is normalized, `zExp' must be 1 less
than the ``true'' floating-point exponent.  The handling of underflow and
overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Procedure
 roundAndPackFloat64(
     zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
 Var
   roundingMode : TFPURoundingMode;
   roundNearestEven, increment, isTiny : Flag;
 Begin

    roundingMode := softfloat_rounding_mode;
    roundNearestEven := flag( roundingMode = float_round_nearest_even );
    increment := flag( sbits32 (zSig2) < 0 );
    if ( roundNearestEven  = flag(FALSE) ) then
      Begin
        if ( roundingMode = float_round_to_zero ) then
            increment := 0
        else
          Begin
            if ( zSign )<> 0 then
              Begin
                increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
              End
            else
              Begin
                increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
              End
          End
      End;
    if ( $7FD <= bits16 (zExp) ) then
      Begin
        if (( $7FD < zExp )
             or (( zExp = $7FD )
                   and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
                   and (increment<>0)
                )
           ) then
           Begin
            float_raise( [float_flag_overflow,float_flag_inexact] );
            if (( roundingMode = float_round_to_zero )
                 or ( (zSign<>0) and ( roundingMode = float_round_up ) )
                 or ( (zSign = 0) and ( roundingMode = float_round_down ) )
               ) then
              Begin
                packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
                exit;
              End;
            packFloat64( zSign, $7FF, 0, 0, c );
            exit;
           End;
        if ( zExp < 0 ) then
           Begin
            isTiny :=
                   flag( softfloat_detect_tininess = float_tininess_before_rounding )
                or flag( zExp < -1 )
                or  flag(increment = 0)
                or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
            shift64ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
            zExp := 0;
            if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
            if ( roundNearestEven )<>0 then
              Begin
                increment := flag( sbits32 (zSig2) < 0 );
              End
            else
              Begin
                if ( zSign )<>0 then
                  Begin
                    increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
                  End
                else
                  Begin
                    increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
                  End
              End;
        End;
    End;
    if ( zSig2 )<>0 then
       set_inexact_flag;
    if ( increment )<>0 then
      Begin
        add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
        zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
      End
    else
      Begin
        if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
      End;
    packFloat64( zSign, zExp, zSig0, zSig1, c );
 End;

{*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded
| to a subnormal number, and the underflow and inexact exceptions are raised
| if the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
var
    roundingMode: TFPURoundingMode;
    roundNearestEven: flag;
    roundIncrement, roundBits: int16;
    isTiny: flag;
begin
    roundingMode := softfloat_rounding_mode;
    roundNearestEven := ord( roundingMode = float_round_nearest_even );
    roundIncrement := $200;
    if ( roundNearestEven=0 ) then
    begin
        if ( roundingMode = float_round_to_zero ) then
        begin
            roundIncrement := 0;
        end
        else begin
            roundIncrement := $3FF;
            if ( zSign<>0 ) then
            begin
                if ( roundingMode = float_round_up ) then
                  roundIncrement := 0;
            end
            else begin
                if ( roundingMode = float_round_down ) then
                  roundIncrement := 0;
            end
        end
    end;
    roundBits := zSig and $3FF;
    if ( $7FD <= bits16(zExp) ) then
    begin
        if (    ( $7FD < zExp )
             or (    ( zExp = $7FD )
                  and ( sbits64( zSig + roundIncrement ) < 0 ) )
           ) then
           begin
            float_raise( [float_flag_overflow,float_flag_inexact] );
            result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
            exit;
        end;
        if ( zExp < 0 ) then
        begin
            isTiny := ord(
                   ( softfloat_detect_tininess = float_tininess_before_rounding )
                or ( zExp < -1 )
                or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
            shift64RightJamming( zSig, - zExp, zSig );
            zExp := 0;
            roundBits := zSig and $3FF;
            if ( isTiny and roundBits )<>0 then
              float_raise( float_flag_underflow );
        end
    end;
    if ( roundBits<>0 ) then
      set_inexact_flag;
    zSig := ( zSig + roundIncrement ) shr 10;
    zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
    if ( zSig = 0 ) then
      zExp := 0;
    result:=packFloat64( zSign, zExp, zSig );
end;

{*
-------------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
and significand formed by the concatenation of `zSig0' and `zSig1', and
returns the proper double-precision floating-point value corresponding
to the abstract input.  This routine is just like `roundAndPackFloat64'
except that the input significand has fewer bits and does not have to be
normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
point exponent.
-------------------------------------------------------------------------------
*}
Procedure
 normalizeRoundAndPackFloat64(
     zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
 Var
   shiftCount : int8;
   zSig2 : bits32;
 Begin

    if ( zSig0 = 0 ) then
     Begin
        zSig0 := zSig1;
        zSig1 := 0;
        zExp := zExp -32;
     End;
    shiftCount := countLeadingZeros32( zSig0 ) - 11;
    if ( 0 <= shiftCount ) then
      Begin
        zSig2 := 0;
        shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
      End
    else
      Begin
        shift64ExtraRightJamming
          (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
      End;
    zExp := zExp - shiftCount;
    roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  End;

{*
----------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
and significand `zSig', and returns the proper double-precision floating-
point value corresponding to the abstract input.  This routine is just like
`roundAndPackFloat64' except that `zSig' does not have to be normalized.
Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
floating-point exponent.
----------------------------------------------------------------------------
*}

function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  var
    shiftCount: int8;
  begin
    shiftCount := countLeadingZeros64( zSig ) - 1;
    result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  end;

{*
-------------------------------------------------------------------------------
Returns the result of converting the 32-bit two's complement integer `a' to
the single-precision floating-point format.  The conversion is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
 Var
  zSign : Flag;
 Begin

    if ( a = 0 ) then
      Begin
       int32_to_float32.float32 := 0;
       exit;
      End;
    if ( a = sbits32 ($80000000) ) then
      Begin
       int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
       exit;
      end;
    zSign := flag( a < 0 );
    If zSign<>0 then
      a := -a;
    int32_to_float32.float32:=
      normalizeRoundAndPackFloat32( zSign, $9C, a );
 End;


{*
-------------------------------------------------------------------------------
Returns the result of converting the 32-bit two's complement integer `a' to
the double-precision floating-point format.  The conversion is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function int32_to_float64( a: int32) : float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  var
    zSign : flag;
    absA : bits32;
    shiftCount : int8;
    zSig0, zSig1 : bits32;
  Begin

    if ( a = 0 ) then
      Begin
       packFloat64( 0, 0, 0, 0, result );
       exit;
      end;
    zSign := flag( a < 0 );
    if ZSign<>0 then
      AbsA := -a
    else
      AbsA := a;
    shiftCount := countLeadingZeros32( absA ) - 11;
    if ( 0 <= shiftCount ) then
      Begin
        zSig0 := absA shl shiftCount;
        zSig1 := 0;
      End
    else
      Begin
        shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
      End;
    packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  End;

{$ifdef FPC_SOFTFLOAT_FLOATX80}

{$if not defined(packFloatx80)}
function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
forward;
{$endif}

{*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function int32_to_floatx80( a: int32 ): floatx80;
var
    zSign: flag;
    absA: uint32;
    shiftCount: int8;
    zSig: bits64;
begin
    if ( a = 0 ) then begin
        result := packFloatx80( 0, 0, 0 );
        exit;
    end;
    zSign := ord( a < 0 );
    if zSign <> 0 then absA := - a else absA := a;
    shiftCount := countLeadingZeros32( absA ) + 32;
    zSig := absA;
    result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
end;

{$endif FPC_SOFTFLOAT_FLOATX80}

{$ifdef FPC_SOFTFLOAT_FLOAT128}

{$if not defined(packFloat128)}
function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
forward;
{$endif}

{*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a' to
| the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function int32_to_float128( a: int32 ): float128;
var
    zSign: flag;
    absA: uint32;
    shiftCount: int8;
    zSig0: bits64;
begin
    if ( a = 0 ) then begin
        result := packFloat128( 0, 0, 0, 0 );
        exit;
    end;
    zSign := ord( a < 0 );
    if zSign <> 0 then absA := - a else absA := a;
    shiftCount := countLeadingZeros32( absA ) + 17;
    zSig0 := absA;
    result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
end;

{$endif FPC_SOFTFLOAT_FLOAT128}

{*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point value
`a' to the 32-bit two's complement integer format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic---which means in particular that the conversion is rounded
according to the current rounding mode.  If `a' is a NaN, the largest
positive integer is returned.  Otherwise, if the conversion overflows, the
largest integer with the same sign as `a' is returned.
-------------------------------------------------------------------------------
*}
Function float32_to_int32( a : float32rec) : int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  Var
    aSign: flag;
    aExp, shiftCount: int16;
    aSig, aSigExtra: bits32;
    z: int32;
    roundingMode: TFPURoundingMode;
  Begin

    aSig := extractFloat32Frac( a.float32 );
    aExp := extractFloat32Exp( a.float32 );
    aSign := extractFloat32Sign( a.float32 );
    shiftCount := aExp - $96;
    if ( 0 <= shiftCount ) then
      Begin
        if ( $9E <= aExp ) then
          Begin
            if ( a.float32 <> $CF000000 ) then
              Begin
                float_raise( float_flag_invalid );
                if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
                  Begin
                    float32_to_int32 := $7FFFFFFF;
                    exit;
                  End;
              End;
            float32_to_int32 := sbits32 ($80000000);
            exit;
          End;
        z := ( aSig or $00800000 ) shl shiftCount;
        if ( aSign<>0 ) then z := - z;
      End
    else
      Begin
        if ( aExp < $7E ) then
          Begin
            aSigExtra := aExp OR aSig;
            z := 0;
          End
        else
         Begin
            aSig := aSig OR $00800000;
            aSigExtra := aSig shl ( shiftCount and 31 );
            z := aSig shr ( - shiftCount );
         End;
        if ( aSigExtra<>0 ) then
          set_inexact_flag;
        roundingMode := softfloat_rounding_mode;
        if ( roundingMode = float_round_nearest_even ) then
          Begin
            if ( sbits32 (aSigExtra) < 0 ) then
              Begin
                Inc(z);
                if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
                  z := z and not 1;
              End;
              if ( aSign<>0 ) then
                z := - z;
          End
        else
          Begin
            aSigExtra := flag( aSigExtra <> 0 );
            if ( aSign<>0 ) then
             Begin
                z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
                z := - z;
             End
            else
             Begin
                z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
             End
          End;
      End;
   float32_to_int32 := z;
  End;

{*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point value
`a' to the 32-bit two's complement integer format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic, except that the conversion is always rounded toward zero.
If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
the conversion overflows, the largest integer with the same sign as `a' is
returned.
-------------------------------------------------------------------------------
*}
Function float32_to_int32_round_to_zero( a: Float32rec ): int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
 Var
    aSign : flag;
    aExp, shiftCount : int16;
    aSig : bits32;
    z : int32;
 Begin
    aSig := extractFloat32Frac( a.float32 );
    aExp := extractFloat32Exp( a.float32 );
    aSign := extractFloat32Sign( a.float32 );
    shiftCount := aExp - $9E;
    if ( 0 <= shiftCount ) then
      Begin
        if ( a.float32 <> $CF000000 ) then
          Begin
            float_raise( float_flag_invalid );
            if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
              Begin
                float32_to_int32_round_to_zero := $7FFFFFFF;
                exit;
              end;
          End;
        float32_to_int32_round_to_zero:= sbits32 ($80000000);
        exit;
      End
    else
      if ( aExp <= $7E ) then
      Begin
        if ( aExp or aSig )<>0 then
           set_inexact_flag;
        float32_to_int32_round_to_zero := 0;
        exit;
      End;
    aSig := ( aSig or $00800000 ) shl 8;
    z := aSig shr ( - shiftCount );
    if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
      Begin
        set_inexact_flag;
      End;
    if ( aSign<>0 ) then z := - z;
    float32_to_int32_round_to_zero := z;
 End;

{*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the 64-bit two's complement integer format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*}

function float32_to_int64( a: float32 ): int64;
var
    aSign: flag;
    aExp, shiftCount: int16;
    aSig: bits32;
    aSig64, aSigExtra: bits64;
begin
    aSig := extractFloat32Frac( a );
    aExp := extractFloat32Exp( a );
    aSign := extractFloat32Sign( a );
    shiftCount := $BE - aExp;
    if ( shiftCount < 0 ) then begin
        float_raise( float_flag_invalid );
        if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
            result := $7FFFFFFFFFFFFFFF;
            exit;
        end;
        result := $8000000000000000;
        exit;
    end;
    if ( aExp <> 0 ) then aSig := aSig or $00800000;
    aSig64 := aSig;
    aSig64 := aSig64 shl 40;
    shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
    result := roundAndPackInt64( aSign, aSig64, aSigExtra );
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the 64-bit two's complement integer format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*}

function float32_to_int64_round_to_zero( a: float32 ): int64;
var
    aSign: flag;
    aExp, shiftCount: int16;
    aSig: bits32;
    aSig64: bits64;
    z: int64;
begin
    aSig := extractFloat32Frac( a );
    aExp := extractFloat32Exp( a );
    aSign := extractFloat32Sign( a );
    shiftCount := aExp - $BE;
    if ( 0 <= shiftCount ) then begin
        if ( a <> $DF000000 ) then begin
            float_raise( float_flag_invalid );
            if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
                result := $7FFFFFFFFFFFFFFF;
                exit;
            end;
        end;
        result := $8000000000000000;
        exit;
    end
    else if ( aExp <= $7E ) then begin
        if ( aExp or aSig <> 0 ) then set_inexact_flag;
        result := 0;
        exit;
    end;
    aSig64 := aSig or $00800000;
    aSig64 := aSig64 shl 40;
    z := aSig64 shr ( - shiftCount );
    if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
        set_inexact_flag;
    if ( aSign <> 0 ) then z := - z;
    result := z;
end;

{*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point value
`a' to the double-precision floating-point format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_to_float64( a : float32rec) : Float64;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  Var
    aSign : flag;
    aExp : int16;
    aSig, zSig0, zSig1: bits32;
    tmp : CommonNanT;
  Begin
    aSig := extractFloat32Frac( a.float32 );
    aExp := extractFloat32Exp( a.float32 );
    aSign := extractFloat32Sign( a.float32 );
    if ( aExp = $FF ) then
      Begin
        if ( aSig<>0 ) then
          Begin
            tmp:=float32ToCommonNaN(a.float32);
            result:=commonNaNToFloat64(tmp);
            exit;
          End;
          packFloat64( aSign, $7FF, 0, 0, result);
          exit;
      End;
    if ( aExp = 0 ) then
      Begin
        if ( aSig = 0 ) then
          Begin
            packFloat64( aSign, 0, 0, 0, result );
            exit;
          end;
        normalizeFloat32Subnormal( aSig, aExp, aSig );
        Dec(aExp);
      End;
    shift64Right( aSig, 0, 3, zSig0, zSig1 );
    packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  End;

{$ifdef FPC_SOFTFLOAT_FLOATX80}

{*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the extended
| double-precision floating-point format.
*----------------------------------------------------------------------------*}

function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
var
    z : floatx80;
begin
    z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
    z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
    result := z;
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function float32_to_floatx80( a: float32 ): floatx80;
var
    aSign: flag;
    aExp: int16;
    aSig: bits32;
    tmp: commonNaNT;
begin
    aSig := extractFloat32Frac( a );
    aExp := extractFloat32Exp( a );
    aSign := extractFloat32Sign( a );
    if ( aExp = $FF ) then begin
        if ( aSig <> 0 ) then begin
            tmp:=float32ToCommonNaN(a);
            result := commonNaNToFloatx80( tmp );
            exit;
        end;
        result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( aSig = 0 ) then begin
            result := packFloatx80( aSign, 0, 0 );
            exit;
        end;
        normalizeFloat32Subnormal( aSig, aExp, aSig );
    end;
    aSig := aSig or $00800000;
    result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
end;

{$endif FPC_SOFTFLOAT_FLOATX80}

{$ifdef FPC_SOFTFLOAT_FLOAT128}

{*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the double-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function float32_to_float128( a: float32 ): float128;
var
    aSign: flag;
    aExp: int16;
    aSig: bits32;
    tmp: commonNaNT;
begin
    aSig := extractFloat32Frac( a );
    aExp := extractFloat32Exp( a );
    aSign := extractFloat32Sign( a );
    if ( aExp = $FF ) then begin
        if ( aSig <> 0 ) then begin
            tmp:=float32ToCommonNaN(a);
            result := commonNaNToFloat128( tmp );
            exit;
        end;
        result := packFloat128( aSign, $7FFF, 0, 0 );
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( aSig = 0 ) then begin
            result := packFloat128( aSign, 0, 0, 0 );
            exit;
        end;
        normalizeFloat32Subnormal( aSig, aExp, aSig );
        dec( aExp );
    end;
    result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
end;

{$endif FPC_SOFTFLOAT_FLOAT128}

{*
-------------------------------------------------------------------------------
Rounds the single-precision floating-point value `a' to an integer,
and returns the result as a single-precision floating-point value.  The
operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_round_to_int( a: float32rec): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  Var
    aSign: flag;
    aExp: int16;
    lastBitMask, roundBitsMask: bits32;
    roundingMode: TFPURoundingMode;
    z: float32;
  Begin
    aExp := extractFloat32Exp( a.float32 );
    if ( $96 <= aExp ) then
     Begin
        if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
          Begin
            float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
            exit;
          End;
        float32_round_to_int:=a;
        exit;
     End;
    if ( aExp <= $7E ) then
      Begin
        if ( bits32 ( a.float32 shl 1 ) = 0 ) then
          Begin
             float32_round_to_int:=a;
             exit;
          end;
        set_inexact_flag;
        aSign := extractFloat32Sign( a.float32 );

        case ( softfloat_rounding_mode ) of
         float_round_nearest_even:
            Begin
              if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
                Begin
                  float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
                  exit;
                End;
            End;
         float_round_down:
            Begin
              if aSign <> 0 then
                 float32_round_to_int.float32 := $BF800000
              else
                 float32_round_to_int.float32 := 0;
              exit;
            End;
         float_round_up:
            Begin
              if aSign <> 0 then
                 float32_round_to_int.float32 := $80000000
              else
                 float32_round_to_int.float32 := $3F800000;
              exit;
            End;
        end;
        float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
        exit;
      End;
    lastBitMask := 1;
    {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
    lastBitMask := lastBitMask shl ($96 - aExp);
    roundBitsMask := lastBitMask - 1;
    z := a.float32;
    roundingMode := softfloat_rounding_mode;
    if ( roundingMode = float_round_nearest_even ) then
      Begin
        z := z + (lastBitMask shr 1);
        if ( ( z and roundBitsMask ) = 0 ) then
           z := z and not lastBitMask;
      End
    else if ( roundingMode <> float_round_to_zero ) then
      Begin
        if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
          Begin
            z := z + roundBitsMask;
          End;
      End;
    z := z and not roundBitsMask;
    if ( z <> a.float32 ) then
      set_inexact_flag;
    float32_round_to_int.float32 := z;
  End;

{*
-------------------------------------------------------------------------------
Returns the result of adding the absolute values of the single-precision
floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
before being returned.  `zSign' is ignored if the result is a NaN.
The addition is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  Var
    aExp, bExp, zExp: int16;
    aSig, bSig, zSig: bits32;
    expDiff: int16;
    label roundAndPack;
  Begin
    aSig:=extractFloat32Frac( a );
    aExp:=extractFloat32Exp( a );
    bSig:=extractFloat32Frac( b );
    bExp := extractFloat32Exp( b );
    expDiff := aExp - bExp;
    aSig := aSig shl 6;
    bSig := bSig shl 6;
    if ( 0 < expDiff ) then
    Begin
        if ( aExp = $FF ) then
          Begin
            if ( aSig <> 0) then
              Begin
                addFloat32Sigs := propagateFloat32NaN( a, b );
                exit;
              End;
            addFloat32Sigs := a;
            exit;
          End;
        if ( bExp = 0 ) then
          Begin
             Dec(expDiff);
          End
        else
          Begin
            bSig := bSig or $20000000;
          End;
        shift32RightJamming( bSig, expDiff, bSig );
        zExp := aExp;
    End
    else
    If ( expDiff < 0 ) then
      Begin
        if ( bExp = $FF ) then
        Begin
            if ( bSig<>0 ) then
              Begin
                addFloat32Sigs := propagateFloat32NaN( a, b );
                exit;
              end;

            addFloat32Sigs := packFloat32( zSign, $FF, 0 );
            exit;
        End;
        if ( aExp = 0 ) then
          Begin
            Inc(expDiff);
          End
        else
          Begin
            aSig := aSig OR $20000000;
          End;
        shift32RightJamming( aSig, - expDiff, aSig );
        zExp := bExp;
    End
    else
    Begin
        if ( aExp = $FF ) then
        Begin
            if ( aSig OR  bSig )<> 0 then
              Begin
                addFloat32Sigs := propagateFloat32NaN( a, b );
                exit;
              end;
            addFloat32Sigs := a;
            exit;
        End;
        if ( aExp = 0 ) then
          Begin
             addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
             exit;
          end;
        zSig := $40000000 + aSig + bSig;
        zExp := aExp;
        goto roundAndPack;
    End;
    aSig := aSig OR $20000000;
    zSig := ( aSig + bSig ) shl 1;
    Dec(zExp);
    if ( sbits32 (zSig) < 0 ) then
      Begin
        zSig := aSig + bSig;
        Inc(zExp);
      End;
 roundAndPack:
    addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
 End;

{*
-------------------------------------------------------------------------------
Returns the result of subtracting the absolute values of the single-
precision floating-point values `a' and `b'.  If `zSign' is 1, the
difference is negated before being returned.  `zSign' is ignored if the
result is a NaN.  The subtraction is performed according to the IEC/IEEE
Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  Var
    aExp, bExp, zExp: int16;
    aSig, bSig, zSig: bits32;
    expDiff : int16;
    label aExpBigger;
    label bExpBigger;
    label aBigger;
    label bBigger;
    label normalizeRoundAndPack;
  Begin
    aSig := extractFloat32Frac( a );
    aExp := extractFloat32Exp( a );
    bSig := extractFloat32Frac( b );
    bExp := extractFloat32Exp( b );
    expDiff := aExp - bExp;
    aSig := aSig shl 7;
    bSig := bSig shl 7;
    if ( 0 < expDiff ) then goto aExpBigger;
    if ( expDiff < 0 ) then goto bExpBigger;
    if ( aExp = $FF ) then
    Begin
        if ( aSig OR  bSig )<> 0 then
          Begin
           subFloat32Sigs := propagateFloat32NaN( a, b );
           exit;
          End;
        float_raise( float_flag_invalid );
        subFloat32Sigs := float32_default_nan;
        exit;
    End;
    if ( aExp = 0 ) then
    Begin
        aExp := 1;
        bExp := 1;
    End;
    if ( bSig < aSig ) Then goto aBigger;
    if ( aSig < bSig ) Then goto bBigger;
    subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
    exit;
 bExpBigger:
    if ( bExp = $FF ) then
    Begin
        if ( bSig<>0 ) then
        Begin
          subFloat32Sigs := propagateFloat32NaN( a, b );
          exit;
        End;
        subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
        exit;
    End;
    if ( aExp = 0 ) then
      Begin
        Inc(expDiff);
      End
    else
      Begin
        aSig := aSig OR $40000000;
      End;
    shift32RightJamming( aSig, - expDiff, aSig );
    bSig := bSig OR $40000000;
 bBigger:
    zSig := bSig - aSig;
    zExp := bExp;
    zSign := zSign xor 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp = $FF ) then
      Begin
        if ( aSig <> 0) then
          Begin
            subFloat32Sigs := propagateFloat32NaN( a, b );
            exit;
          End;
        subFloat32Sigs := a;
        exit;
      End;
    if ( bExp = 0 ) then
      Begin
        Dec(expDiff);
      End
    else
      Begin
        bSig := bSig OR $40000000;
      End;
    shift32RightJamming( bSig, expDiff, bSig );
    aSig := aSig OR $40000000;
 aBigger:
    zSig := aSig - bSig;
    zExp := aExp;
 normalizeRoundAndPack:
    Dec(zExp);
    subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  End;

{*
-------------------------------------------------------------------------------
Returns the result of adding the single-precision floating-point values `a'
and `b'.  The operation is performed according to the IEC/IEEE Standard for
Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  Var
    aSign, bSign: Flag;
  Begin
    aSign := extractFloat32Sign( a.float32 );
    bSign := extractFloat32Sign( b.float32 );
    if ( aSign = bSign ) then
      Begin
        float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
      End
    else
      Begin
        float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
      End;
  End;

{*
-------------------------------------------------------------------------------
Returns the result of subtracting the single-precision floating-point values
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_sub( a: float32rec ; b:float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  Var
    aSign, bSign: flag;
  Begin
    aSign := extractFloat32Sign( a.float32 );
    bSign := extractFloat32Sign( b.float32 );
    if ( aSign = bSign ) then
      Begin
        float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
      End
    else
      Begin
        float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
      End;
  End;

{*
-------------------------------------------------------------------------------
Returns the result of multiplying the single-precision floating-point values
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}

  Var
    aSign, bSign, zSign: flag;
    aExp, bExp, zExp : int16;
    aSig, bSig, zSig0, zSig1: bits32;
  Begin
    aSig := extractFloat32Frac( a.float32 );
    aExp := extractFloat32Exp( a.float32 );
    aSign := extractFloat32Sign( a.float32 );
    bSig := extractFloat32Frac( b.float32 );
    bExp := extractFloat32Exp( b.float32 );
    bSign := extractFloat32Sign( b.float32 );
    zSign := aSign xor bSign;
    if ( aExp = $FF ) then
    Begin
        if ( (aSig<>0) OR ( ( bExp = $FF ) AND  (bSig<>0) ) ) then
        Begin
            float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
            exit;
        End;
        if ( ( bits32(bExp) OR  bSig ) = 0 ) then
        Begin
            float_raise( float_flag_invalid );
            float32_mul.float32 := float32_default_nan;
            exit;
        End;
        float32_mul.float32 := packFloat32( zSign, $FF, 0 );
        exit;
    End;
    if ( bExp = $FF ) then
    Begin
        if ( bSig <> 0 ) then
        Begin
           float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
           exit;
        End;
        if ( ( bits32(aExp) OR  aSig ) = 0 ) then
        Begin
            float_raise( float_flag_invalid );
            float32_mul.float32 := float32_default_nan;
            exit;
        End;
        float32_mul.float32 := packFloat32( zSign, $FF, 0 );
        exit;
    End;
    if ( aExp = 0 ) then
    Begin
        if ( aSig = 0 ) then
        Begin
           float32_mul.float32 := packFloat32( zSign, 0, 0 );
           exit;
        End;
        normalizeFloat32Subnormal( aSig, aExp, aSig );
    End;
    if ( bExp = 0 ) then
    Begin
        if ( bSig = 0 ) then
         Begin
           float32_mul.float32 := packFloat32( zSign, 0, 0 );
           exit;
         End;
        normalizeFloat32Subnormal( bSig, bExp, bSig );
    End;
    zExp := aExp + bExp - $7F;
    aSig := ( aSig OR  $00800000 ) shl 7;
    bSig := ( bSig OR  $00800000 ) shl 8;
    mul32To64( aSig, bSig, zSig0, zSig1 );
    zSig0 := zSig0 OR bits32( zSig1 <> 0 );
    if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
    Begin
        zSig0 := zSig0 shl 1;
        Dec(zExp);
    End;
    float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
 End;

{*
-------------------------------------------------------------------------------
Returns the result of dividing the single-precision floating-point value `a'
by the corresponding value `b'.  The operation is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  Var
    aSign, bSign, zSign: flag;
    aExp, bExp, zExp: int16;
    aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  Begin
    aSig := extractFloat32Frac( a.float32 );
    aExp := extractFloat32Exp( a.float32 );
    aSign := extractFloat32Sign( a.float32 );
    bSig := extractFloat32Frac( b.float32 );
    bExp := extractFloat32Exp( b.float32 );
    bSign := extractFloat32Sign( b.float32 );
    zSign := aSign xor bSign;
    if ( aExp = $FF ) then
      Begin
        if ( aSig <> 0 ) then
        Begin
           float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
           exit;
        End;
        if ( bExp = $FF ) then
        Begin
            if ( bSig <> 0) then
            Begin
              float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
              exit;
            End;
            float_raise( float_flag_invalid );
            float32_div.float32 := float32_default_nan;
            exit;
        End;
        float32_div.float32 := packFloat32( zSign, $FF, 0 );
        exit;
      End;
    if ( bExp = $FF ) then
    Begin
        if ( bSig <> 0) then
        Begin
          float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
          exit;
        End;
        float32_div.float32 := packFloat32( zSign, 0, 0 );
        exit;
    End;
    if ( bExp = 0 ) Then
    Begin
        if ( bSig = 0 ) Then
        Begin
            if ( ( bits32(aExp) OR  aSig ) = 0 ) then
            Begin
                float_raise( float_flag_invalid );
                float32_div.float32 := float32_default_nan;
                exit;
            End;
            float_raise( float_flag_divbyzero );
            float32_div.float32 := packFloat32( zSign, $FF, 0 );
            exit;
        End;
        normalizeFloat32Subnormal( bSig, bExp, bSig );
    End;
    if ( aExp = 0 ) Then
    Begin
        if ( aSig = 0 ) Then
        Begin
          float32_div.float32 := packFloat32( zSign, 0, 0 );
          exit;
        End;
        normalizeFloat32Subnormal( aSig, aExp, aSig );
    End;
    zExp := aExp - bExp + $7D;
    aSig := ( aSig OR  $00800000 ) shl 7;
    bSig := ( bSig OR  $00800000 ) shl 8;
    if ( bSig <= ( aSig + aSig ) ) then
    Begin
        aSig := aSig shr 1;
        Inc(zExp);
    End;
    zSig := estimateDiv64To32( aSig, 0, bSig );
    if ( ( zSig and $3F ) <= 2 ) then
    Begin
        mul32To64( bSig, zSig, term0, term1 );
        sub64( aSig, 0, term0, term1, rem0, rem1 );
        while ( sbits32 (rem0) < 0 ) do
        Begin
            Dec(zSig);
            add64( rem0, rem1, 0, bSig, rem0, rem1 );
        End;
        zSig := zSig or bits32( rem1 <> 0 );
    End;
    float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );

  End;

{*
-------------------------------------------------------------------------------
Returns the remainder of the single-precision floating-point value `a'
with respect to the corresponding value `b'.  The operation is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  Var
    aSign, zSign: flag;
    aExp, bExp, expDiff: int16;
    aSig, bSig, q, alternateASig: bits32;
    sigMean: sbits32;
  Begin
    aSig := extractFloat32Frac( a.float32 );
    aExp := extractFloat32Exp( a.float32 );
    aSign := extractFloat32Sign( a.float32 );
    bSig := extractFloat32Frac( b.float32 );
    bExp := extractFloat32Exp( b.float32 );
    if ( aExp = $FF ) then
    Begin
        if ( (aSig<>0) OR ( ( bExp = $FF ) AND  (bSig <>0)) ) then
        Begin
            float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
            exit;
        End;
        float_raise( float_flag_invalid );
        float32_rem.float32 := float32_default_nan;
        exit;
    End;
    if ( bExp = $FF ) then
    Begin
        if ( bSig <> 0 ) then
        Begin
          float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
          exit;
        End;
        float32_rem := a;
        exit;
    End;
    if ( bExp = 0 ) then
    Begin
        if ( bSig = 0 ) then
        Begin
            float_raise( float_flag_invalid );
            float32_rem.float32 := float32_default_nan;
            exit;
        End;
        normalizeFloat32Subnormal( bSig, bExp, bSig );
    End;
    if ( aExp = 0 ) then
    Begin
        if ( aSig = 0 ) then
        Begin
           float32_rem := a;
           exit;
        End;
        normalizeFloat32Subnormal( aSig, aExp, aSig );
    End;
    expDiff := aExp - bExp;
    aSig := ( aSig OR  $00800000 ) shl 8;
    bSig := ( bSig OR  $00800000 ) shl 8;
    if ( expDiff < 0 ) then
    Begin
        if ( expDiff < -1 ) then
        Begin
           float32_rem := a;
           exit;
        End;
        aSig := aSig shr 1;
    End;
    q := bits32( bSig <= aSig );
    if ( q <> 0) then
       aSig := aSig - bSig;
    expDiff := expDiff - 32;
    while ( 0 < expDiff ) do
    Begin
        q := estimateDiv64To32( aSig, 0, bSig );
        if (2 < q) then
         q := q - 2
        else
         q := 0;
        aSig := - ( ( bSig shr 2 ) * q );
        expDiff := expDiff - 30;
    End;
    expDiff := expDiff + 32;
    if ( 0 < expDiff ) then
    Begin
        q := estimateDiv64To32( aSig, 0, bSig );
        if (2 < q) then
         q := q - 2
        else
         q := 0;
        q := q shr (32 - expDiff);
        bSig := bSig shr 2;
        aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
    End
    else
    Begin
        aSig := aSig shr 2;
        bSig := bSig shr 2;
    End;
    Repeat
        alternateASig := aSig;
        Inc(q);
        aSig := aSig - bSig;
    Until not ( 0 <= sbits32 (aSig) );
    sigMean := aSig + alternateASig;
    if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND  (( q and 1 )<>0) ) ) then
    Begin
        aSig := alternateASig;
    End;
    zSign := flag( sbits32 (aSig) < 0 );
    if ( zSign<>0 ) then
      aSig := - aSig;
    float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  End;

{*
-------------------------------------------------------------------------------
Returns the square root of the single-precision floating-point value `a'.
The operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_sqrt(a: float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
Var
    aSign : flag;
    aExp, zExp : int16;
    aSig, zSig, rem0, rem1, term0, term1: bits32;
    label roundAndPack;
Begin
    aSig := extractFloat32Frac( a.float32 );
    aExp := extractFloat32Exp( a.float32 );
    aSign := extractFloat32Sign( a.float32 );
    if ( aExp = $FF ) then
    Begin
        if ( aSig <> 0) then
        Begin
           float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
           exit;
        End;
        if ( aSign = 0) then
        Begin
          float32_sqrt := a;
          exit;
        End;
        float_raise( float_flag_invalid );
        float32_sqrt.float32 := float32_default_nan;
        exit;
    End;
    if ( aSign <> 0) then
    Begin
        if ( ( bits32(aExp) OR  aSig ) = 0 ) then
        Begin
           float32_sqrt := a;
           exit;
        End;
        float_raise( float_flag_invalid );
        float32_sqrt.float32 := float32_default_nan;
        exit;
    End;
    if ( aExp = 0 ) then
    Begin
        if ( aSig = 0 ) then
        Begin
           float32_sqrt.float32 := 0;
           exit;
        End;
        normalizeFloat32Subnormal( aSig, aExp, aSig );
    End;
    zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
    aSig := ( aSig OR  $00800000 ) shl 8;
    zSig := estimateSqrt32( aExp, aSig ) + 2;
    if ( ( zSig and $7F ) <= 5 ) then
    Begin
        if ( zSig < 2 ) then
        Begin
            zSig := $7FFFFFFF;
            goto roundAndPack;
        End
        else
        Begin
            aSig  := aSig shr (aExp and 1);
            mul32To64( zSig, zSig, term0, term1 );
            sub64( aSig, 0, term0, term1, rem0, rem1 );
            while ( sbits32 (rem0) < 0 ) do
            Begin
                Dec(zSig);
                shortShift64Left( 0, zSig, 1, term0, term1 );
                term1 := term1 or 1;
                add64( rem0, rem1, term0, term1, rem0, rem1 );
            End;
            zSig := zSig OR bits32( ( rem0 OR  rem1 ) <> 0 );
        End;
    End;
    shift32RightJamming( zSig, 1, zSig );
 roundAndPack:
    float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise.  The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
Begin
    if ((( extractFloat32Exp( a.float32 ) = $FF ) AND  (extractFloat32Frac( a.float32 )<>0))
         OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND  (extractFloat32Frac( b.float32 )<>0) )
       ) then
    Begin
        if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
        Begin
            float_raise( float_flag_invalid );
        End;
        float32_eq := 0;
        exit;
    End;
    float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR  b.float32 ) shl 1 ) = 0 );
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is less than
or equal to the corresponding value `b', and 0 otherwise.  The comparison
is performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_le( a: float32rec; b : float32rec ):flag;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
var
    aSign, bSign: flag;
Begin

    if (    ( ( extractFloat32Exp( a.float32 ) = $FF ) AND  (extractFloat32Frac( a.float32 )<>0) )
         OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND  (extractFloat32Frac( b.float32 )<>0) )
       ) then
    Begin
        float_raise( float_flag_invalid );
        float32_le := 0;
        exit;
    End;
    aSign := extractFloat32Sign( a.float32 );
    bSign := extractFloat32Sign( b.float32 );
    if ( aSign <> bSign ) then
    Begin
       float32_le :=  aSign OR flag( bits32 ( ( a.float32 OR  b.float32 ) shl 1 ) = 0 );
       exit;
    End;
    float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));

End;

{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is less than
the corresponding value `b', and 0 otherwise.  The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
var
    aSign, bSign: flag;
Begin

    if (    ( ( extractFloat32Exp( a.float32 ) = $FF ) AND  (extractFloat32Frac( a.float32 ) <>0))
         OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND  (extractFloat32Frac( b.float32 ) <>0) )
       ) then
    Begin
        float_raise( float_flag_invalid );
        float32_lt :=0;
        exit;
    End;
    aSign := extractFloat32Sign( a.float32 );
    bSign := extractFloat32Sign( b.float32 );
    if ( aSign <> bSign ) then
    Begin
       float32_lt := aSign AND  flag( bits32 ( ( a.float32 OR  b.float32 ) shl 1 ) <> 0 );
       exit;
    End;
    float32_lt := flag(flag( a.float32 <> b.float32 ) AND  flag( aSign xor flag( a.float32 < b.float32 ) ));

End;

{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise.  The invalid exception is
raised if either operand is a NaN.  Otherwise, the comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_eq_signaling( a: float32; b: float32) : flag;
Begin

    if (    ( ( extractFloat32Exp( a ) = $FF ) AND  (extractFloat32Frac( a ) <> 0))
         OR ( ( extractFloat32Exp( b ) = $FF ) AND  (extractFloat32Frac( b ) <> 0))
       ) then
    Begin
        float_raise( float_flag_invalid );
        float32_eq_signaling := 0;
        exit;
    End;
    float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR  b ) shl 1 ) = 0 ));
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is less than or
equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
cause an exception.  Otherwise, the comparison is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_le_quiet( a: float32 ; b : float32 ): flag;
Var
    aSign, bSign: flag;
Begin
    if (    ( ( extractFloat32Exp( a ) = $FF ) AND  (extractFloat32Frac( a )<>0) )
         OR ( ( extractFloat32Exp( b ) = $FF ) AND  (extractFloat32Frac( b )<>0) )
       ) then
    Begin
        if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
        Begin
            float_raise( float_flag_invalid );
        End;
        float32_le_quiet := 0;
        exit;
    End;
    aSign := extractFloat32Sign( a );
    bSign := extractFloat32Sign( b );
    if ( aSign <> bSign ) then
    Begin
       float32_le_quiet := aSign OR flag( bits32 ( ( a OR  b ) shl 1 ) = 0 );
       exit;
    End;
    float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is less than
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
Var
   aSign, bSign: flag;
Begin
    if (    ( ( extractFloat32Exp( a ) = $FF ) AND  (extractFloat32Frac( a )<>0) )
         OR ( ( extractFloat32Exp( b ) = $FF ) AND  (extractFloat32Frac( b )<>0) )
       ) then
    Begin
        if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
        Begin
            float_raise( float_flag_invalid );
        End;
        float32_lt_quiet := 0;
        exit;
    End;
    aSign := extractFloat32Sign( a );
    bSign := extractFloat32Sign( b );
    if ( aSign <> bSign ) then
    Begin
        float32_lt_quiet := aSign AND  flag( bits32 ( ( a OR  b ) shl 1 ) <> 0 );
        exit;
    End;
    float32_lt_quiet := flag(flag( a <> b ) AND  ( aSign xor flag( a < b ) ));
End;

{*
-------------------------------------------------------------------------------
Returns the result of converting the double-precision floating-point value
`a' to the 32-bit two's complement integer format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic---which means in particular that the conversion is rounded
according to the current rounding mode.  If `a' is a NaN, the largest
positive integer is returned.  Otherwise, if the conversion overflows, the
largest integer with the same sign as `a' is returned.
-------------------------------------------------------------------------------
*}
Function float64_to_int32(a: float64): int32;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
var
    aSign: flag;
    aExp, shiftCount: int16;
    aSig0, aSig1, absZ, aSigExtra: bits32;
    z: int32;
    roundingMode: TFPURoundingMode;
    label invalid;
Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    shiftCount := aExp - $413;
    if ( 0 <= shiftCount ) then
    Begin
        if ( $41E < aExp ) then
        Begin
            if ( ( aExp = $7FF ) AND  (( aSig0 OR  aSig1 )<>0) ) then
               aSign := 0;
            goto invalid;
        End;
        shortShift64Left(
            aSig0 OR  $00100000, aSig1, shiftCount, absZ, aSigExtra );
        if ( $80000000 < absZ ) then
          goto invalid;
    End
    else
    Begin
        aSig1 := flag( aSig1 <> 0 );
        if ( aExp < $3FE ) then
        Begin
            aSigExtra := aExp OR  aSig0 OR  aSig1;
            absZ := 0;
        End
        else
        Begin
            aSig0 := aSig0 OR $00100000;
            aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR  aSig1;
            absZ := aSig0 shr ( - shiftCount );
        End;
    End;
    roundingMode := softfloat_rounding_mode;
    if ( roundingMode = float_round_nearest_even ) then
    Begin
        if ( sbits32(aSigExtra) < 0 ) then
        Begin
            Inc(absZ);
            if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
               absZ :=  absZ and not 1;
        End;
        if aSign <> 0 then
          z := - absZ
        else
          z := absZ;
    End
    else
    Begin
        aSigExtra := bits32( aSigExtra <> 0 );
        if ( aSign <> 0) then
        Begin
            z := - (   absZ
                    + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
        End
        else
        Begin
            z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
        End
    End;
    if ( (( aSign xor flag( z < 0 ) )<>0) AND  (z<>0) ) then
    Begin
 invalid:
        float_raise( float_flag_invalid );
        if (aSign <> 0 ) then
          float64_to_int32 := sbits32 ($80000000)
        else
          float64_to_int32 :=  $7FFFFFFF;
        exit;
    End;
    if ( aSigExtra <> 0) then
      set_inexact_flag;
    float64_to_int32 := z;
End;


{*
-------------------------------------------------------------------------------
Returns the result of converting the double-precision floating-point value
`a' to the 32-bit two's complement integer format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic, except that the conversion is always rounded toward zero.
If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
the conversion overflows, the largest integer with the same sign as `a' is
returned.
-------------------------------------------------------------------------------
*}
Function float64_to_int32_round_to_zero(a: float64 ): int32;
{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
Var
    aSign: flag;
    aExp, shiftCount: int16;
    aSig0, aSig1, absZ, aSigExtra: bits32;
    z: int32;
    label invalid;
 Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    shiftCount := aExp - $413;
    if ( 0 <= shiftCount ) then
    Begin
        if ( $41E < aExp ) then
        Begin
            if ( ( aExp = $7FF ) AND  (( aSig0 OR  aSig1 )<>0) ) then
               aSign := 0;
            goto invalid;
        End;
        shortShift64Left(
            aSig0 OR  $00100000, aSig1, shiftCount, absZ, aSigExtra );
    End
    else
    Begin
        if ( aExp < $3FF ) then
        Begin
            if ( bits32(aExp) OR  aSig0 OR  aSig1 )<>0 then
            Begin
              set_inexact_flag;
            End;
            float64_to_int32_round_to_zero := 0;
            exit;
        End;
        aSig0 := aSig0 or $00100000;
        aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR  aSig1;
        absZ := aSig0 shr ( - shiftCount );
    End;
    if aSign <> 0 then
      z := - absZ
    else
      z := absZ;
    if ( (( aSign xor flag( z < 0 )) <> 0) AND  (z<>0) ) then
    Begin
 invalid:
        float_raise( float_flag_invalid );
        if (aSign <> 0) then
          float64_to_int32_round_to_zero := sbits32 ($80000000)
        else
          float64_to_int32_round_to_zero :=  $7FFFFFFF;
        exit;
    End;
    if ( aSigExtra <> 0) then
       set_inexact_flag;
    float64_to_int32_round_to_zero := z;
 End;

{*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the 64-bit two's complement integer format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*}

function float64_to_int64( a: float64 ): int64;
var
    aSign: flag;
    aExp, shiftCount: int16;
    aSig, aSigExtra: bits64;
begin
    aSig := extractFloat64Frac( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
    shiftCount := $433 - aExp;
    if ( shiftCount <= 0 ) then begin
        if ( $43E < aExp ) then begin
            float_raise( float_flag_invalid );
            if (    ( aSign = 0 )
                 or (    ( aExp = $7FF )
                     and ( aSig <> $0010000000000000 ) )
               ) then begin
                result := $7FFFFFFFFFFFFFFF;
                exit;
            end;
            result := $8000000000000000;
            exit;
        end;
        aSigExtra := 0;
        aSig := aSig shl ( - shiftCount );
    end
    else
        shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
    result := roundAndPackInt64( aSign, aSig, aSigExtra );
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the 64-bit two's complement integer format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*}

{$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
function float64_to_int64_round_to_zero( a: float64 ): int64;
var
    aSign: flag;
    aExp, shiftCount: int16;
    aSig: bits64;
    z: int64;
begin
    aSig := extractFloat64Frac( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
    shiftCount := aExp - $433;
    if ( 0 <= shiftCount ) then begin
        if ( $43E <= aExp ) then begin
            if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
                float_raise( float_flag_invalid );
                if (    ( aSign = 0 )
                     or (    ( aExp = $7FF )
                         and ( aSig <> $0010000000000000 ) )
                   ) then begin
                    result := $7FFFFFFFFFFFFFFF;
                    exit;
                end;
            end;
            result := $8000000000000000;
            exit;
        end;
        z := aSig shl shiftCount;
    end
    else begin
        if ( aExp < $3FE ) then begin
            if ( aExp or aSig <> 0 ) then set_inexact_flag;
            result := 0;
            exit;
        end;
        z := aSig shr ( - shiftCount );
        if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
          set_inexact_flag;
    end;
    if ( aSign <> 0 ) then z := - z;
    result := z;
end;

{*
-------------------------------------------------------------------------------
Returns the result of converting the double-precision floating-point value
`a' to the single-precision floating-point format.  The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_to_float32(a: float64 ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
Var
    aSign: flag;
    aExp: int16;
    aSig0, aSig1, zSig: bits32;
    allZero: bits32;
    tmp : CommonNanT;
Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    if ( aExp = $7FF ) then
    Begin
        if ( aSig0 OR  aSig1 ) <> 0 then
        Begin
            tmp:=float64ToCommonNaN(a);
            float64_to_float32.float32 := commonNaNToFloat32( tmp );
            exit;
        End;
        float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
        exit;
    End;
    shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
    if ( aExp <> 0) then
      zSig := zSig OR $40000000;
    float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
End;

{$ifdef FPC_SOFTFLOAT_FLOATX80}

{*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function float64_to_floatx80( a: float64 ): floatx80;
var
    aSign: flag;
    aExp: int16;
    aSig: bits64;
begin
    aSig := extractFloat64Frac( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    if ( aExp = $7FF ) then begin
        if ( aSig <> 0 ) then begin
            result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
            exit;
        end;
        result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( aSig = 0 ) then begin
            result := packFloatx80( aSign, 0, 0 );
            exit;
        end;
        normalizeFloat64Subnormal( aSig, aExp, aSig );
    end;
    result :=
        packFloatx80(
            aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
end;

{$endif FPC_SOFTFLOAT_FLOATX80}

{*
-------------------------------------------------------------------------------
Rounds the double-precision floating-point value `a' to an integer,
and returns the result as a double-precision floating-point value.  The
operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
function float64_round_to_int(a: float64) : Float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}

Var
    aSign: flag;
    aExp: int16;
    lastBitMask, roundBitsMask: bits32;
    roundingMode: TFPURoundingMode;
    z: float64;
Begin
    aExp := extractFloat64Exp( a );
    if ( $413 <= aExp ) then
    Begin
        if ( $433 <= aExp ) then
        Begin
            if (    ( aExp = $7FF )
                 AND
            (
            ( extractFloat64Frac0( a ) OR  extractFloat64Frac1( a )
            ) <>0)
            )  then
            Begin
                propagateFloat64NaN( a, a, result );
                exit;
            End;
            result := a;
            exit;
        End;
        lastBitMask := 1;
        lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
        roundBitsMask := lastBitMask - 1;
        z := a;
        roundingMode := softfloat_rounding_mode;
        if ( roundingMode = float_round_nearest_even ) then
        Begin
            if ( lastBitMask <> 0) then
            Begin
                add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
                if ( ( z.low and roundBitsMask ) = 0 ) then
                   z.low := z.low and not lastBitMask;
            End
            else
            Begin
                if ( sbits32 (z.low) < 0 ) then
                Begin
                    Inc(z.high);
                    if ( bits32 ( z.low shl 1 ) = 0 ) then
                      z.high := z.high and not 1;
                End;
            End;
        End
        else if ( roundingMode <> float_round_to_zero ) then
        Begin
            if (   extractFloat64Sign( z )
                 xor flag( roundingMode = float_round_up ) )<> 0 then
            Begin
                add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
            End;
        End;
        z.low := z.low and not roundBitsMask;
    End
    else
    Begin
        if ( aExp <= $3FE ) then
        Begin
            if ( ( ( bits32 ( a.high shl 1 ) ) OR  a.low ) = 0 ) then
            Begin
                result := a;
                exit;
            End;
            set_inexact_flag;
            aSign := extractFloat64Sign( a );
            case ( softfloat_rounding_mode ) of
             float_round_nearest_even:
               Begin
                if (    ( aExp = $3FE )
                     AND  ( (extractFloat64Frac0( a ) OR  extractFloat64Frac1( a ) )<>0)
                   ) then
                Begin
                    packFloat64( aSign, $3FF, 0, 0, result );
                    exit;
                End;

               End;
               float_round_down:
                Begin
                  if aSign<>0 then
                   packFloat64( 1, $3FF, 0, 0, result )
                  else
                   packFloat64( 0, 0, 0, 0, result );
                  exit;
                End;
             float_round_up:
                Begin
                  if aSign <> 0 then
                   packFloat64( 1, 0, 0, 0, result )
                  else
                   packFloat64( 0, $3FF, 0, 0, result );
                  exit;
                End;
            end;
            packFloat64( aSign, 0, 0, 0, result );
            exit;
        End;
        lastBitMask := 1;
        lastBitMask := lastBitMask shl ($413 - aExp);
        roundBitsMask := lastBitMask - 1;
        z.low := 0;
        z.high := a.high;
        roundingMode := softfloat_rounding_mode;
        if ( roundingMode = float_round_nearest_even ) then
        Begin
            z.high := z.high + lastBitMask shr 1;
            if ( ( ( z.high and roundBitsMask ) OR  a.low ) = 0 ) then
            Begin
                z.high := z.high and not lastBitMask;
            End;
        End
        else if ( roundingMode <> float_round_to_zero ) then
        Begin
            if (   extractFloat64Sign( z )
                 xor flag( roundingMode = float_round_up ) )<> 0 then
            Begin
                z.high := z.high or bits32( a.low <> 0 );
                z.high := z.high + roundBitsMask;
            End;
        End;
        z.high := z.high and not roundBitsMask;
    End;
    if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
    Begin
       set_inexact_flag;
    End;
    result := z;
End;


{*
-------------------------------------------------------------------------------
Returns the result of adding the absolute values of the double-precision
floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
before being returned.  `zSign' is ignored if the result is a NaN.
The addition is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
Var
    aExp, bExp, zExp: int16;
    aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
    expDiff: int16;
    label shiftRight1;
    label roundAndPack;
Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    bSig1 := extractFloat64Frac1( b );
    bSig0 := extractFloat64Frac0( b );
    bExp := extractFloat64Exp( b );
    expDiff := aExp - bExp;
    if ( 0 < expDiff ) then
    Begin
        if ( aExp = $7FF ) then
        Begin
            if ( aSig0 OR  aSig1 ) <> 0 then
            Begin
              propagateFloat64NaN( a, b, out );
              exit;
            end;
            out := a;
            exit;
        End;
        if ( bExp = 0 ) then
        Begin
            Dec(expDiff);
        End
        else
        Begin
            bSig0 := bSig0 or $00100000;
        End;
        shift64ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
        zExp := aExp;
    End
    else if ( expDiff < 0 ) then
    Begin
        if ( bExp = $7FF ) then
        Begin
            if ( bSig0 OR  bSig1 ) <> 0 then
            Begin
               propagateFloat64NaN( a, b, out );
               exit;
            End;
            packFloat64( zSign, $7FF, 0, 0, out );
            exit;
        End;
        if ( aExp = 0 ) then
        Begin
            Inc(expDiff);
        End
        else
        Begin
            aSig0 := aSig0 or $00100000;
        End;
        shift64ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
        zExp := bExp;
    End
    else
    Begin
        if ( aExp = $7FF ) then
        Begin
            if ( aSig0 OR  aSig1 OR  bSig0 OR  bSig1 ) <> 0 then
            Begin
                propagateFloat64NaN( a, b, out );
                exit;
            End;
            out := a;
            exit;
        End;
        add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
        if ( aExp = 0 ) then
        Begin
           packFloat64( zSign, 0, zSig0, zSig1, out );
           exit;
        End;
        zSig2 := 0;
        zSig0 := zSig0 or $00200000;
        zExp := aExp;
        goto shiftRight1;
    End;
    aSig0 := aSig0 or $00100000;
    add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
    Dec(zExp);
    if ( zSig0 < $00200000 ) then
       goto roundAndPack;
    Inc(zExp);
 shiftRight1:
    shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
 roundAndPack:
    roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );

End;

{*
-------------------------------------------------------------------------------
Returns the result of subtracting the absolute values of the double-
precision floating-point values `a' and `b'.  If `zSign' is 1, the
difference is negated before being returned.  `zSign' is ignored if the
result is a NaN.  The subtraction is performed according to the IEC/IEEE
Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
Var
    aExp, bExp, zExp: int16;
    aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
    expDiff: int16;
    z: float64;
    label aExpBigger;
    label bExpBigger;
    label aBigger;
    label bBigger;
    label normalizeRoundAndPack;
Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    bSig1 := extractFloat64Frac1( b );
    bSig0 := extractFloat64Frac0( b );
    bExp := extractFloat64Exp( b );
    expDiff := aExp - bExp;
    shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
    shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
    if ( 0 < expDiff ) then goto aExpBigger;
    if ( expDiff < 0 ) then goto bExpBigger;
    if ( aExp = $7FF ) then
    Begin
        if ( aSig0 OR  aSig1 OR  bSig0 OR  bSig1 ) <> 0 then
        Begin
            propagateFloat64NaN( a, b, out );
            exit;
        End;
        float_raise( float_flag_invalid );
        z.low := float64_default_nan_low;
        z.high := float64_default_nan_high;
        out := z;
        exit;
    End;
    if ( aExp = 0 ) then
    Begin
        aExp := 1;
        bExp := 1;
    End;
    if ( bSig0 < aSig0 ) then goto aBigger;
    if ( aSig0 < bSig0 ) then goto bBigger;
    if ( bSig1 < aSig1 ) then goto aBigger;
    if ( aSig1 < bSig1 ) then goto bBigger;
    packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
    exit;
 bExpBigger:
    if ( bExp = $7FF ) then
    Begin
        if ( bSig0 OR  bSig1 ) <> 0 then
        Begin
           propagateFloat64NaN( a, b, out );
           exit;
        End;
        packFloat64( zSign xor 1, $7FF, 0, 0, out );
        exit;
    End;
    if ( aExp = 0 ) then
    Begin
        Inc(expDiff);
    End
    else
    Begin
        aSig0 := aSig0 or $40000000;
    End;
    shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
    bSig0 := bSig0 or $40000000;
 bBigger:
    sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
    zExp := bExp;
    zSign := zSign xor 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp = $7FF ) then
    Begin
        if ( aSig0 OR  aSig1 ) <> 0 then
        Begin
           propagateFloat64NaN( a, b, out );
           exit;
        End;
        out :=  a;
        exit;
    End;
    if ( bExp = 0 ) then
    Begin
        Dec(expDiff);
    End
    else
    Begin
        bSig0 := bSig0 or $40000000;
    End;
    shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
    aSig0 := aSig0 or $40000000;
 aBigger:
    sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
    zExp := aExp;
 normalizeRoundAndPack:
    Dec(zExp);
    normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );

End;

{*
-------------------------------------------------------------------------------
Returns the result of adding the double-precision floating-point values `a'
and `b'.  The operation is performed according to the IEC/IEEE Standard for
Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_add( a: float64; b : float64) : Float64;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
Var
    aSign, bSign: flag;
Begin
    aSign := extractFloat64Sign( a );
    bSign := extractFloat64Sign( b );
    if ( aSign = bSign ) then
    Begin
         addFloat64Sigs( a, b, aSign, result );
    End
    else
    Begin
        subFloat64Sigs( a, b, aSign, result );
    End;
End;

{*
-------------------------------------------------------------------------------
Returns the result of subtracting the double-precision floating-point values
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_sub(a: float64; b : float64) : Float64;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
Var
    aSign, bSign: flag;
Begin
    aSign := extractFloat64Sign( a );
    bSign := extractFloat64Sign( b );
    if ( aSign = bSign ) then
    Begin
        subFloat64Sigs( a, b, aSign, result );
    End
    else
    Begin
        addFloat64Sigs( a, b, aSign, result );
    End;
End;

{*
-------------------------------------------------------------------------------
Returns the result of multiplying the double-precision floating-point values
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_mul( a: float64; b:float64) : Float64;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
Var
    aSign, bSign, zSign: flag;
    aExp, bExp, zExp: int16;
    aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
    z: float64;
    label invalid;
Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    bSig1 := extractFloat64Frac1( b );
    bSig0 := extractFloat64Frac0( b );
    bExp := extractFloat64Exp( b );
    bSign := extractFloat64Sign( b );
    zSign := aSign xor bSign;
    if ( aExp = $7FF ) then
    Begin
        if (    (( aSig0 OR  aSig1 ) <>0)
             OR ( ( bExp = $7FF ) AND  (( bSig0 OR  bSig1 )<>0) ) ) then
        Begin
            propagateFloat64NaN( a, b, result );
            exit;
        End;
        if ( ( bits32(bExp) OR  bSig0 OR  bSig1 ) = 0 ) then goto invalid;
        packFloat64( zSign, $7FF, 0, 0, result );
        exit;
    End;
    if ( bExp = $7FF ) then
    Begin
        if ( bSig0 OR  bSig1 )<> 0 then
        Begin
          propagateFloat64NaN( a, b, result );
          exit;
        End;
        if ( ( aExp OR  aSig0 OR  aSig1 ) = 0 ) then
        Begin
 invalid:
            float_raise( float_flag_invalid );
            z.low := float64_default_nan_low;
            z.high := float64_default_nan_high;
            result := z;
            exit;
        End;
        packFloat64( zSign, $7FF, 0, 0, result );
        exit;
    End;
    if ( aExp = 0 ) then
    Begin
        if ( ( aSig0 OR  aSig1 ) = 0 ) then
        Begin
           packFloat64( zSign, 0, 0, 0, result );
           exit;
        End;
        normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    End;
    if ( bExp = 0 ) then
    Begin
        if ( ( bSig0 OR  bSig1 ) = 0 ) then
        Begin
          packFloat64( zSign, 0, 0, 0, result );
          exit;
        End;
        normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
    End;
    zExp := aExp + bExp - $400;
    aSig0 := aSig0 or $00100000;
    shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
    mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
    add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
    zSig2 := zSig2 or flag( zSig3 <> 0 );
    if ( $00200000 <= zSig0 ) then
    Begin
        shift64ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
        Inc(zExp);
    End;
    roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
End;

{*
-------------------------------------------------------------------------------
Returns the result of dividing the double-precision floating-point value `a'
by the corresponding value `b'.  The operation is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_div(a: float64; b : float64) : Float64;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
Var
    aSign, bSign, zSign: flag;
    aExp, bExp, zExp: int16;
    aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
    rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
    z: float64;
    label invalid;
Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    bSig1 := extractFloat64Frac1( b );
    bSig0 := extractFloat64Frac0( b );
    bExp := extractFloat64Exp( b );
    bSign := extractFloat64Sign( b );
    zSign := aSign xor bSign;
    if ( aExp = $7FF ) then
    Begin
        if ( aSig0 OR  aSig1 )<> 0 then
        Begin
           propagateFloat64NaN( a, b, result );
           exit;
        end;
        if ( bExp = $7FF ) then
        Begin
            if ( bSig0 OR  bSig1 )<>0 then
            Begin
               propagateFloat64NaN( a, b, result );
               exit;
            End;
            goto invalid;
        End;
        packFloat64( zSign, $7FF, 0, 0, result );
        exit;
    End;
    if ( bExp = $7FF ) then
    Begin
        if ( bSig0 OR  bSig1 )<> 0 then
        Begin
          propagateFloat64NaN( a, b, result );
          exit;
        End;
        packFloat64( zSign, 0, 0, 0, result );
        exit;
    End;
    if ( bExp = 0 ) then
    Begin
        if ( ( bSig0 OR  bSig1 ) = 0 ) then
        Begin
            if ( ( bits32(aExp) OR  aSig0 OR  aSig1 ) = 0 ) then
            Begin
 invalid:
                float_raise( float_flag_invalid );
                z.low := float64_default_nan_low;
                z.high := float64_default_nan_high;
                result := z;
                exit;
            End;
            float_raise( float_flag_divbyzero );
            packFloat64( zSign, $7FF, 0, 0, result );
            exit;
        End;
        normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
    End;
    if ( aExp = 0 ) then
    Begin
        if ( ( aSig0 OR  aSig1 ) = 0 ) then
        Begin
           packFloat64( zSign, 0, 0, 0, result );
           exit;
        End;
        normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    End;
    zExp := aExp - bExp + $3FD;
    shortShift64Left( aSig0 OR  $00100000, aSig1, 11, aSig0, aSig1 );
    shortShift64Left( bSig0 OR  $00100000, bSig1, 11, bSig0, bSig1 );
    if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
    Begin
        shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
        Inc(zExp);
    End;
    zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
    mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
    sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
    while ( sbits32 (rem0) < 0 ) do
    Begin
        Dec(zSig0);
        add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
    End;
    zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
    if ( ( zSig1 and $3FF ) <= 4 ) then
    Begin
        mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
        sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
        while ( sbits32 (rem1) < 0 ) do
        Begin
            Dec(zSig1);
            add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
        End;
        zSig1 := zSig1 or flag( ( rem1 OR  rem2 OR  rem3 ) <> 0 );
    End;
    shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
    roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );

End;

{*
-------------------------------------------------------------------------------
Returns the remainder of the double-precision floating-point value `a'
with respect to the corresponding value `b'.  The operation is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_rem(a: float64; b : float64) : float64;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
Var
    aSign, zSign: flag;
    aExp, bExp, expDiff: int16;
    aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
    allZero, alternateASig0, alternateASig1, sigMean1: bits32;
    sigMean0: sbits32;
    z: float64;
    label invalid;
Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    bSig1 := extractFloat64Frac1( b );
    bSig0 := extractFloat64Frac0( b );
    bExp := extractFloat64Exp( b );
    if ( aExp = $7FF ) then
    Begin
        if ((( aSig0 OR  aSig1 )<>0)
             OR ( ( bExp = $7FF ) AND  (( bSig0 OR  bSig1 )<>0) ) ) then
        Begin
            propagateFloat64NaN( a, b, result );
            exit;
        End;
        goto invalid;
    End;
    if ( bExp = $7FF ) then
    Begin
        if ( bSig0 OR  bSig1 ) <> 0 then
        Begin
          propagateFloat64NaN( a, b, result );
          exit;
        End;
        result := a;
        exit;
    End;
    if ( bExp = 0 ) then
    Begin
        if ( ( bSig0 OR  bSig1 ) = 0 ) then
        Begin
 invalid:
            float_raise( float_flag_invalid );
            z.low := float64_default_nan_low;
            z.high := float64_default_nan_high;
            result := z;
            exit;
        End;
        normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
    End;
    if ( aExp = 0 ) then
    Begin
        if ( ( aSig0 OR  aSig1 ) = 0 ) then
        Begin
           result := a;
           exit;
        End;
        normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    End;
    expDiff := aExp - bExp;
    if ( expDiff < -1 ) then
    Begin
       result := a;
       exit;
    End;
    shortShift64Left(
        aSig0 OR  $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
    shortShift64Left( bSig0 OR  $00100000, bSig1, 11, bSig0, bSig1 );
    q := le64( bSig0, bSig1, aSig0, aSig1 );
    if ( q )<>0 then
       sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
    expDiff := expDiff - 32;
    while ( 0 < expDiff ) do
    Begin
        q := estimateDiv64To32( aSig0, aSig1, bSig0 );
        if 4 < q then
          q:= q - 4
        else
          q := 0;
        mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
        shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
        shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
        sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
        expDiff := expDiff - 29;
    End;
    if ( -32 < expDiff ) then
    Begin
        q := estimateDiv64To32( aSig0, aSig1, bSig0 );
        if 4 < q then
          q := q - 4
        else
          q := 0;
        q := q shr (- expDiff);
        shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
        expDiff := expDiff + 24;
        if ( expDiff < 0 ) then
        Begin
            shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
        End
        else
        Begin
            shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
        End;
        mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
        sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
    End
    else
    Begin
        shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
        shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
    End;
    Repeat
        alternateASig0 := aSig0;
        alternateASig1 := aSig1;
        Inc(q);
        sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
    Until not ( 0 <= sbits32 (aSig0) );
    add64(
        aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
    if (    ( sigMean0 < 0 )
         OR ( ( ( sigMean0 OR  sigMean1 ) = 0 ) AND  (( q AND 1 )<>0) ) ) then
    Begin
        aSig0 := alternateASig0;
        aSig1 := alternateASig1;
    End;
    zSign := flag( sbits32 (aSig0) < 0 );
    if ( zSign <> 0 ) then
       sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
    normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
End;

{*
-------------------------------------------------------------------------------
Returns the square root of the double-precision floating-point value `a'.
The operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
function float64_sqrt( a: float64 ): float64;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
Var
    aSign: flag;
    aExp, zExp: int16;
    aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
    rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
    label invalid;
Begin
    aSig1 := extractFloat64Frac1( a );
    aSig0 := extractFloat64Frac0( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    if ( aExp = $7FF ) then
    Begin
        if ( aSig0 OR  aSig1 ) <> 0 then
        Begin
           propagateFloat64NaN( a, a, result );
           exit;
        End;
        if ( aSign = 0) then
        Begin
          result := a;
          exit;
        End;
        goto invalid;
    End;
    if ( aSign <> 0 ) then
    Begin
        if ( ( bits32(aExp) OR  aSig0 OR  aSig1 ) = 0 ) then
        Begin
           result := a;
           exit;
        End;
 invalid:
        float_raise( float_flag_invalid );
        result.low := float64_default_nan_low;
        result.high := float64_default_nan_high;
        exit;
    End;
    if ( aExp = 0 ) then
    Begin
        if ( ( aSig0 OR  aSig1 ) = 0 ) then
        Begin
           packFloat64( 0, 0, 0, 0, result );
           exit;
        End;
        normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    End;
    zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
    aSig0 := aSig0 or $00100000;
    shortShift64Left( aSig0, aSig1, 11, term0, term1 );
    zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
    if ( zSig0 = 0 ) then
       zSig0 := $7FFFFFFF;
    doubleZSig0 := zSig0 + zSig0;
    shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
    mul32To64( zSig0, zSig0, term0, term1 );
    sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
    while ( sbits32 (rem0) < 0 ) do
    Begin
        Dec(zSig0);
        doubleZSig0 := doubleZSig0 - 2;
        add64( rem0, rem1, 0, doubleZSig0 OR  1, rem0, rem1 );
    End;
    zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
    if ( ( zSig1 and $1FF ) <= 5 ) then
    Begin
        if ( zSig1 = 0 ) then
           zSig1 := 1;
        mul32To64( doubleZSig0, zSig1, term1, term2 );
        sub64( rem1, 0, term1, term2, rem1, rem2 );
        mul32To64( zSig1, zSig1, term2, term3 );
        sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
        while ( sbits32 (rem1) < 0 ) do
        Begin
            Dec(zSig1);
            shortShift64Left( 0, zSig1, 1, term2, term3 );
            term3 := term3 or 1;
            term2 := term2 or doubleZSig0;
            add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
        End;
        zSig1 := zSig1 or bits32( ( rem1 OR  rem2 OR  rem3 ) <> 0 );
    End;
    shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
    roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise.  The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_eq(a: float64; b: float64): flag;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
Begin
    if
         (
                ( extractFloat64Exp( a ) = $7FF )
            AND
                (
                    (extractFloat64Frac0( a )  OR  extractFloat64Frac1( a )) <>0
                )
         )
         OR (
                ( extractFloat64Exp( b ) = $7FF )
           AND  (
                    (extractFloat64Frac0( b ) OR  (extractFloat64Frac1( b )) <> 0
                )
           )
       ) then
    Begin
        if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
            float_raise( float_flag_invalid );
        float64_eq := 0;
        exit;
    End;
    float64_eq := flag(
           ( a.low = b.low )
        AND  (    ( a.high = b.high )
             OR (    ( a.low = 0 )
                  AND  ( bits32 ( ( a.high OR  b.high ) shl 1 ) = 0 ) )
           ));
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is less than
or equal to the corresponding value `b', and 0 otherwise.  The comparison
is performed according to the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_le(a: float64;b: float64): flag;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
Var
    aSign, bSign: flag;
Begin
    if
         (
                ( extractFloat64Exp( a ) = $7FF )
            AND
                (
                    (extractFloat64Frac0( a )  OR  extractFloat64Frac1( a )) <>0
                )
         )
         OR (
                ( extractFloat64Exp( b ) = $7FF )
           AND  (
                    (extractFloat64Frac0( b ) OR  (extractFloat64Frac1( b )) <> 0
                )
           )
       ) then
    Begin
        float_raise( float_flag_invalid );
        float64_le := 0;
        exit;
    End;
    aSign := extractFloat64Sign( a );
    bSign := extractFloat64Sign( b );
    if ( aSign <> bSign ) then
    Begin
        float64_le := flag(
               (aSign <> 0)
            OR (    ( ( bits32 ( ( a.high OR  b.high ) shl 1 ) ) OR  a.low OR  b.low )
                 = 0 ));
        exit;
    End;
    if aSign <> 0 then
      float64_le := le64( b.high, b.low, a.high, a.low )
    else
      float64_le := le64( a.high, a.low, b.high, b.low );
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is less than
the corresponding value `b', and 0 otherwise.  The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_lt(a: float64;b: float64): flag;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
Var
    aSign, bSign: flag;
Begin
    if
         (
                ( extractFloat64Exp( a ) = $7FF )
            AND
                (
                    (extractFloat64Frac0( a )  OR  extractFloat64Frac1( a )) <>0
                )
         )
         OR (
                ( extractFloat64Exp( b ) = $7FF )
           AND  (
                    (extractFloat64Frac0( b ) OR  (extractFloat64Frac1( b )) <> 0
                )
           )
       ) then
    Begin
        float_raise( float_flag_invalid );
        float64_lt := 0;
        exit;
    End;
    aSign := extractFloat64Sign( a );
    bSign := extractFloat64Sign( b );
    if ( aSign <> bSign ) then
    Begin
        float64_lt := flag(
               (aSign <> 0)
            AND  (    ( ( bits32 ( ( a.high OR  b.high ) shl 1 ) ) OR  a.low OR  b.low )
                 <> 0 ));
        exit;
    End;
    if aSign <> 0 then
       float64_lt := lt64( b.high, b.low, a.high, a.low )
    else
       float64_lt := lt64( a.high, a.low, b.high, b.low );
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise.  The invalid exception is
raised if either operand is a NaN.  Otherwise, the comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_eq_signaling( a: float64; b: float64): flag;
Begin

    if
         (
                ( extractFloat64Exp( a ) = $7FF )
            AND
                (
                    (extractFloat64Frac0( a )  OR  extractFloat64Frac1( a )) <>0
                )
         )
         OR (
                ( extractFloat64Exp( b ) = $7FF )
           AND  (
                    (extractFloat64Frac0( b ) OR  (extractFloat64Frac1( b )) <> 0
                )
           )
       ) then
    Begin
        float_raise( float_flag_invalid );
        float64_eq_signaling := 0;
        exit;
    End;
    float64_eq_signaling := flag(
           ( a.low = b.low )
        AND  (    ( a.high = b.high )
             OR (    ( a.low = 0 )
                  AND  ( bits32 ( ( a.high OR  b.high ) shl 1 ) = 0 ) )
           ));
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is less than or
equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
cause an exception.  Otherwise, the comparison is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_le_quiet(a: float64 ; b: float64 ): flag;
Var
    aSign, bSign : flag;
Begin
    if
         (
                ( extractFloat64Exp( a ) = $7FF )
            AND
                (
                    (extractFloat64Frac0( a )  OR  extractFloat64Frac1( a )) <>0
                )
         )
         OR (
                ( extractFloat64Exp( b ) = $7FF )
           AND  (
                    (extractFloat64Frac0( b ) OR  (extractFloat64Frac1( b )) <> 0
                )
           )
       ) then
    Begin
        if ( (float64_is_signaling_nan( a )<>0)  OR (float64_is_signaling_nan( b )<>0) ) then
            float_raise( float_flag_invalid );
        float64_le_quiet := 0;
        exit;
    End;
    aSign := extractFloat64Sign( a );
    bSign := extractFloat64Sign( b );
    if ( aSign <> bSign ) then
    Begin
     float64_le_quiet := flag
      ((aSign <> 0)
            OR (    ( ( bits32 ( ( a.high OR  b.high ) shl 1 ) ) OR  a.low OR  b.low )
                 = 0 ));
        exit;
    End;
    if aSign <> 0 then
      float64_le_quiet := le64( b.high, b.low, a.high, a.low )
    else
      float64_le_quiet := le64( a.high, a.low, b.high, b.low );
End;

{*
-------------------------------------------------------------------------------
Returns 1 if the double-precision floating-point value `a' is less than
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*}
Function float64_lt_quiet(a: float64; b: float64 ): Flag;
Var
    aSign, bSign: flag;
Begin
    if
         (
                ( extractFloat64Exp( a ) = $7FF )
            AND
                (
                    (extractFloat64Frac0( a )  OR  extractFloat64Frac1( a )) <>0
                )
         )
         OR (
                ( extractFloat64Exp( b ) = $7FF )
           AND  (
                    (extractFloat64Frac0( b ) OR  (extractFloat64Frac1( b )) <> 0
                )
           )
       ) then
    Begin
        if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
            float_raise( float_flag_invalid );
        float64_lt_quiet := 0;
        exit;
    End;
    aSign := extractFloat64Sign( a );
    bSign := extractFloat64Sign( b );
    if ( aSign <> bSign ) then
    Begin
      float64_lt_quiet := flag(
               (aSign<>0)
            AND  (    ( ( bits32 ( ( a.high OR  b.high ) shl 1 ) ) OR  a.low OR  b.low )
                 <> 0 ));
        exit;
    End;
    If aSign <> 0 then
      float64_lt_quiet :=  lt64( b.high, b.low, a.high, a.low )
    else
      float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
End;


{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the single-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}
function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
var
    zSign : flag;
    absA : uint64;
    shiftCount: int8;
Begin
    if ( a = 0 ) then
      begin
       int64_to_float32.float32 := 0;
       exit;
      end;
    if a < 0 then
      zSign := flag(TRUE)
    else
      zSign := flag(FALSE);
    if zSign<>0 then
       absA := -a
    else
       absA := a;
    shiftCount := countLeadingZeros64( absA ) - 40;
    if ( 0 <= shiftCount ) then
      begin
        int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
      end
    else
       begin
        shiftCount := shiftCount + 7;
        if ( shiftCount < 0 ) then
            shift64RightJamming( absA, - shiftCount, absA )
        else
            absA := absA shl shiftCount;
        int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
      end;
End;

{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the single-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
| Unisgned version.
*----------------------------------------------------------------------------*}
function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
var
    absA : uint64;
    shiftCount: int8;
Begin
    if ( a = 0 ) then
      begin
       qword_to_float32.float32 := 0;
       exit;
      end;
    absA := a;
    shiftCount := countLeadingZeros64( absA ) - 40;
    if ( 0 <= shiftCount ) then
      begin
        qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
      end
    else
       begin
        shiftCount := shiftCount + 7;
        if ( shiftCount < 0 ) then
            shift64RightJamming( absA, - shiftCount, absA )
        else
            absA := absA shl shiftCount;
        qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
      end;
End;


{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the double-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}
function qword_to_float64( a: qword ): float64;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
var
  shiftCount: int8;
Begin
  if ( a = 0 ) then
    result := packFloat64( 0, 0, 0 )
  else
    begin
      shiftCount := countLeadingZeros64(a) - 1;
      { numbers with <= 53 significant bits are converted exactly }
      if (shiftCount > 9) then
        result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
      else if (shiftCount>=0) then
        result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
      else
        begin
          { the only possible negative value is -1, in case bit 63 of 'a' is set }
          shift64RightJamming(a, 1, a);
          result := roundAndPackFloat64(0, $43d, a);
        end;
    end;
End;


{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the double-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}
function int64_to_float64( a: int64 ): float64;
{$ifdef FPC_IS_SYSTEM}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
Begin
  if ( a = 0 ) then
    result := packFloat64( 0, 0, 0 )
  else if (a = int64($8000000000000000)) then
    result := packFloat64( 1, $43e, 0 )
  else if (a < 0) then
    result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  else
    result := normalizeRoundAndPackFloat64( 0, $43c, a );
End;


{$ifdef FPC_SOFTFLOAT_FLOATX80}

{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function int64_to_floatx80( a: int64 ): floatx80;
var
    zSign: flag;
    absA: uint64;
    shiftCount: int8;
begin
    if ( a = 0 ) then begin
        result := packFloatx80( 0, 0, 0 );
        exit;
    end;
    zSign := ord( a < 0 );
    if zSign <> 0 then absA := - a else absA := a;
    shiftCount := countLeadingZeros64( absA );
    result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
| Unsigned version.
*----------------------------------------------------------------------------*}

function qword_to_floatx80( a: qword ): floatx80;
var
    absA: bits64;
    shiftCount: int8;
begin
    if ( a = 0 ) then begin
        result := packFloatx80( 0, 0, 0 );
        exit;
    end;
    absA := a;
    shiftCount := countLeadingZeros64( absA );
    result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
end;

{$endif FPC_SOFTFLOAT_FLOATX80}

{$ifdef FPC_SOFTFLOAT_FLOAT128}

{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a' to
| the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function int64_to_float128( a: int64 ): float128;
var
    zSign: flag;
    absA: uint64;
    shiftCount: int8;
    zExp: int32;
    zSig0, zSig1: bits64;
begin
    if ( a = 0 ) then begin
        result := packFloat128( 0, 0, 0, 0 );
        exit;
    end;
    zSign := ord( a < 0 );
    if zSign <> 0 then absA := - a else absA := a;
    shiftCount := countLeadingZeros64( absA ) + 49;
    zExp := $406E - shiftCount;
    if ( 64 <= shiftCount ) then begin
        zSig1 := 0;
        zSig0 := absA;
        dec( shiftCount, 64 );
    end
    else begin
        zSig1 := absA;
        zSig0 := 0;
    end;
    shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
    result := packFloat128( zSign, zExp, zSig0, zSig1 );
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a' to
| the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
| Unsigned version.
*----------------------------------------------------------------------------*}

function qword_to_float128( a: qword ): float128;
var
    absA: bits64;
    shiftCount: int8;
    zExp: int32;
    zSig0, zSig1: bits64;
begin
    if ( a = 0 ) then begin
        result := packFloat128( 0, 0, 0, 0 );
        exit;
    end;
    absA := a;
    shiftCount := countLeadingZeros64( absA ) + 49;
    zExp := $406E - shiftCount;
    if ( 64 <= shiftCount ) then begin
        zSig1 := 0;
        zSig0 := absA;
        dec( shiftCount, 64 );
    end
    else begin
        zSig1 := absA;
        zSig0 := 0;
    end;
    shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
    result := packFloat128( 0, zExp, zSig0, zSig1 );
end;

{$endif FPC_SOFTFLOAT_FLOAT128}

{*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
| Otherwise, returns 0.
*----------------------------------------------------------------------------*}

function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
begin
    result := ord(( a0 = b0 ) and ( a1 = b1 ));
end;

{*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
| Otherwise, returns 0.
*----------------------------------------------------------------------------*}

function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
begin
    result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
end;

{*----------------------------------------------------------------------------
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
| by 64 _plus_ the number of bits given in `count'.  The shifted result is
| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
| off form a third 64-bit result as follows:  The _last_ bit shifted off is
| the most-significant bit of the extra result, and the other 63 bits of the
| extra result are all zero if and only if _all_but_the_last_ bits shifted off
| were all zero.  This extra result is stored in the location pointed to by
| `z2Ptr'.  The value of `count' can be arbitrarily large.
|     (This routine makes more sense if `a0', `a1', and `a2' are considered
| to form a fixed-point value with binary point between `a1' and `a2'.  This
| fixed-point value is shifted right by the number of bits given in `count',
| and the integer part of the result is returned at the locations pointed to
| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
| corrupted as described above, and is returned at the location pointed to by
| `z2Ptr'.)
*----------------------------------------------------------------------------*}

procedure shift128ExtraRightJamming(
     a0: bits64;
     a1: bits64;
     a2: bits64;
     count: int16;
     var z0Ptr: bits64;
     var z1Ptr: bits64;
     var z2Ptr: bits64);
var
    z0, z1, z2: bits64;
    negCount: int8;
begin
    negCount := ( - count ) and 63;

    if ( count = 0 ) then
    begin
        z2 := a2;
        z1 := a1;
        z0 := a0;
    end
    else begin
        if ( count < 64 ) then
        begin
            z2 := a1 shl negCount;
            z1 := ( a0 shl negCount ) or  ( a1 shr count );
            z0 := a0 shr count;
        end
        else begin
            if ( count = 64 ) then
            begin
                z2 := a1;
                z1 := a0;
            end
            else begin
                a2 := a2 or a1;
                if ( count < 128 ) then
                begin
                    z2 := a0 shl negCount;
                    z1 := a0 shr ( count and 63 );
                end
                else begin
                	  if ( count = 128 ) then
                      z2 :=  a0
                    else
                      z2 := ord( a0 <> 0 );
                    z1 := 0;
                end;
            end;
            z0 := 0;
        end;
        z2 := z2 or ord( a2 <> 0 );
    end;
    z2Ptr := z2;
    z1Ptr := z1;
    z0Ptr := z0;

end;

{*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
| _plus_ the number of bits given in `count'.  The shifted result is at most
| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
| bits shifted off form a second 64-bit result as follows:  The _last_ bit
| shifted off is the most-significant bit of the extra result, and the other
| 63 bits of the extra result are all zero if and only if _all_but_the_last_
| bits shifted off were all zero.  This extra result is stored in the location
| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
|     (This routine makes more sense if `a0' and `a1' are considered to form
| a fixed-point value with binary point between `a0' and `a1'.  This fixed-
| point value is shifted right by the number of bits given in `count', and
| the integer part of the result is returned at the location pointed to by
| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
| described above, and is returned at the location pointed to by `z1Ptr'.)
*----------------------------------------------------------------------------*}

procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
var
    z0, z1: bits64;
    negCount: int8;
begin
    negCount := ( - count ) and 63;

    if ( count = 0 ) then
    begin
        z1 := a1;
        z0 := a0;
    end
    else if ( count < 64 ) then
    begin
        z1 := ( a0 shl negCount ) or ord( a1 <>  0 );
        z0 := a0 shr count;
    end
    else begin
        if ( count = 64 ) then
        begin
            z1 := a0 or ord( a1 <>  0 );
        end
        else begin
            z1 := ord( ( a0 or a1 ) <>  0 );
        end;
        z0 := 0;
    end;
    z1Ptr := z1;
    z0Ptr := z0;
end;

{$ifdef FPC_SOFTFLOAT_FLOATX80}

{*----------------------------------------------------------------------------
| Returns the fraction bits of the extended double-precision floating-point
| value `a'.
*----------------------------------------------------------------------------*}

function extractFloatx80Frac(a : floatx80): bits64;inline;
begin
    result:=a.low;
end;

{*----------------------------------------------------------------------------
| Returns the exponent bits of the extended double-precision floating-point
| value `a'.
*----------------------------------------------------------------------------*}

function extractFloatx80Exp(a : floatx80): int32;inline;
begin
    result:=a.high and $7FFF;
end;

{*----------------------------------------------------------------------------
| Returns the sign bit of the extended double-precision floating-point value
| `a'.
*----------------------------------------------------------------------------*}

function extractFloatx80Sign(a : floatx80): flag;inline;
begin
    result:=a.high shr 15;
end;

{*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*}

procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
var
    shiftCount: int8;
begin
    shiftCount := countLeadingZeros64( aSig );
    zSigPtr := aSig shl shiftCount;
    zExpPtr := 1 - shiftCount;
end;

{*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
| extended double-precision floating-point value, returning the result.
*----------------------------------------------------------------------------*}

function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
var
    z: floatx80;
begin
    z.low := zSig;
    z.high := (  bits16(zSign) shl 15 ) + zExp;
    result:=z;
end;

{*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
var
    roundingMode: TFPURoundingMode;
    roundNearestEven, increment, isTiny: flag;
    roundIncrement, roundMask, roundBits: int64;
label
    precision80, overflow;
begin
    roundingMode := softfloat_rounding_mode;
    roundNearestEven := flag( roundingMode = float_round_nearest_even );
    if ( roundingPrecision = 80 ) then
      goto precision80;
    if ( roundingPrecision = 64 ) then
    begin
        roundIncrement := int64( $0000000000000400 );
        roundMask := int64( $00000000000007FF );
    end
    else if ( roundingPrecision = 32 ) then
    begin
        roundIncrement := int64( $0000008000000000 );
        roundMask := int64( $000000FFFFFFFFFF );
    end
    else begin
        goto precision80;
    end;
    zSig0 := zSig0 or ord( zSig1 <> 0 );
    if ( not (roundNearestEven<>0) ) then
    begin
        if ( roundingMode = float_round_to_zero ) then
        begin
            roundIncrement := 0;
        end
        else begin
            roundIncrement := roundMask;
            if ( zSign<>0 ) then
            begin
                if ( roundingMode = float_round_up ) then
                  roundIncrement := 0;
            end
            else begin
                if ( roundingMode = float_round_down ) then
                  roundIncrement := 0;
            end;
        end;
    end;
    roundBits := zSig0 and roundMask;
    if ( $7FFD <= bits32( zExp - 1 ) ) then begin
        if (    ( $7FFE < zExp )
             or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
           ) then begin
            goto overflow;
        end;
        if ( zExp <= 0 ) then begin
            isTiny := ord (
                   ( softfloat_detect_tininess = float_tininess_before_rounding )
                or ( zExp < 0 )
                or ( zSig0 <= zSig0 + roundIncrement ) );
            shift64RightJamming( zSig0, 1 - zExp, zSig0 );
            zExp := 0;
            roundBits := zSig0 and roundMask;
            if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
            if ( roundBits <> 0 ) then set_inexact_flag;
            inc( zSig0, roundIncrement );
            if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
            roundIncrement := roundMask + 1;
            if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
                roundMask := roundMask or roundIncrement;
            end;
            zSig0 := zSig0 and not roundMask;
            result:=packFloatx80( zSign, zExp, zSig0 );
            exit;
        end;
    end;
    if ( roundBits <> 0 ) then set_inexact_flag;
    inc( zSig0, roundIncrement );
    if ( zSig0 < roundIncrement ) then begin
        inc(zExp);
        zSig0 := bits64( $8000000000000000 );
    end;
    roundIncrement := roundMask + 1;
    if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
        roundMask := roundMask or roundIncrement;
    end;
    zSig0 := zSig0 and not roundMask;
    if ( zSig0 = 0 ) then zExp := 0;
    result:=packFloatx80( zSign, zExp, zSig0 );
    exit;
 precision80:
    increment := ord ( sbits64( zSig1 ) < 0 );
    if ( roundNearestEven = 0 ) then begin
        if ( roundingMode = float_round_to_zero ) then begin
            increment := 0;
        end
        else begin
            if ( zSign <> 0 ) then begin
                increment := ord ( roundingMode = float_round_down ) and zSig1;
            end
            else begin
                increment := ord ( roundingMode = float_round_up ) and zSig1;
            end;
        end;
    end;
    if ( $7FFD <= bits32( zExp - 1 ) ) then begin
        if (    ( $7FFE < zExp )
             or (    ( zExp = $7FFE )
                  and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
                  and ( increment <> 0 )
                )
           ) then begin
            roundMask := 0;
 overflow:
            float_raise( [float_flag_overflow,float_flag_inexact] );
            if (    ( roundingMode = float_round_to_zero )
                 or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
                 or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
               ) then begin
                result:=packFloatx80( zSign, $7FFE, not roundMask );
                exit;
            end;
            result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
            exit;
        end;
        if ( zExp <= 0 ) then begin
            isTiny := ord(
                   ( softfloat_detect_tininess = float_tininess_before_rounding )
                or ( zExp < 0 )
                or ( increment = 0 )
                or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
            zExp := 0;
            if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
            if ( zSig1 <> 0 ) then set_inexact_flag;
            if ( roundNearestEven <> 0 ) then begin
                increment := ord( sbits64( zSig1 ) < 0 );
            end
            else begin
                if ( zSign <> 0 ) then begin
                    increment := ord( roundingMode = float_round_down ) and zSig1;
                end
                else begin
                    increment := ord( roundingMode = float_round_up ) and zSig1;
                end;
            end;
            if ( increment <> 0 ) then begin
                inc(zSig0);
                zSig0 :=
                    not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
                if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
            end;
            result:=packFloatx80( zSign, zExp, zSig0 );
            exit;
        end;
    end;
    if ( zSig1 <> 0 ) then set_inexact_flag;
    if ( increment <> 0 ) then begin
        inc(zSig0);
        if ( zSig0 = 0 ) then begin
            inc(zExp);
            zSig0 := bits64( $8000000000000000 );
        end
        else begin
            zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
        end;
    end
    else begin
        if ( zSig0 = 0 ) then zExp := 0;
    end;
    result:=packFloatx80( zSign, zExp, zSig0 );

end;

{*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*}

function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
var
    shiftCount: int8;
begin
    if ( zSig0 = 0 ) then begin
        zSig0 := zSig1;
        zSig1 := 0;
        dec( zExp, 64 );
    end;
    shiftCount := countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
    zExp := zExp - shiftCount;
    result :=
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN, the
| largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*}

function floatx80_to_int32(a: floatx80): int32;
var
    aSign: flag;
    aExp, shiftCount: int32;
    aSig: bits64;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
    shiftCount := $4037 - aExp;
    if ( shiftCount <= 0 ) then shiftCount := 1;
    shift64RightJamming( aSig, shiftCount, aSig );
    result := roundAndPackInt32( aSign, aSig );

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*}

function floatx80_to_int32_round_to_zero(a: floatx80): int32;
var
    aSign: flag;
    aExp, shiftCount: int32;
    aSig, savedASig: bits64;
    z: int32;
label
    invalid;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    if ( $401E < aExp ) then begin
        if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
        goto invalid;
    end
    else if ( aExp < $3FFF ) then begin
        if ( aExp or aSig <> 0 ) then set_inexact_flag;
        result := 0;
        exit;
    end;
    shiftCount := $403E - aExp;
    savedASig := aSig;
    aSig := aSig shr shiftCount;
    z := aSig;
    if ( aSign <> 0 ) then z := - z;
    if ( ord( z < 0 ) xor aSign ) <> 0 then begin
 invalid:
        float_raise( float_flag_invalid );
        if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
        exit;
    end;
    if ( ( aSig shl shiftCount ) <> savedASig ) then begin
        set_inexact_flag;
    end;
    result := z;

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN,
| the largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*}

function floatx80_to_int64(a: floatx80): int64;
var
    aSign: flag;
    aExp, shiftCount: int32;
    aSig, aSigExtra: bits64;
begin

    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    shiftCount := $403E - aExp;
    if ( shiftCount <= 0 ) then begin
        if ( shiftCount <> 0 ) then begin
            float_raise( float_flag_invalid );
            if (    ( aSign = 0 )
                 or (    ( aExp = $7FFF )
                      and ( aSig <> bits64( $8000000000000000 ) ) )
               ) then begin
                result := $7FFFFFFFFFFFFFFF;
                exit;
            end;
            result := $8000000000000000;
            exit;
        end;
        aSigExtra := 0;
    end
    else begin
        shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
    end;
    result := roundAndPackInt64( aSign, aSig, aSigExtra );

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*}

function floatx80_to_int64_round_to_zero(a: floatx80): int64;
var
    aSign: flag;
    aExp, shiftCount: int32;
    aSig: bits64;
    z: int64;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    shiftCount := aExp - $403E;
    if ( 0 <= shiftCount ) then begin
        aSig := $7FFFFFFFFFFFFFFF;
        if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
            float_raise( float_flag_invalid );
            if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
                result := $7FFFFFFFFFFFFFFF;
                exit;
            end;
        end;
        result := $8000000000000000;
        exit;
    end
    else if ( aExp < $3FFF ) then begin
        if ( aExp or aSig <> 0 ) then set_inexact_flag;
        result := 0;
        exit;
    end;
    z := aSig shr ( - shiftCount );
    if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
        set_inexact_flag;
    end;
    if ( aSign <> 0 ) then z := - z;
    result := z;

end;

{*----------------------------------------------------------------------------
| The pattern for a default generated extended double-precision NaN.  The
| `high' and `low' values hold the most- and least-significant bits,
| respectively.
*----------------------------------------------------------------------------*}
const
    floatx80_default_nan_high = $FFFF;
    floatx80_default_nan_low  = bits64( $C000000000000000 );

{*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is a
| signaling NaN; otherwise returns 0.
*----------------------------------------------------------------------------*}

function floatx80_is_signaling_nan(a : floatx80): flag;
var
    aLow: bits64;
begin
    aLow := a.low and not $4000000000000000;
    result := ord(
           ( a.high and $7FFF = $7FFF )
       and ( bits64( aLow shl 1 ) <> 0 )
       and ( a.low = aLow ) );
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point NaN `a' to the canonical NaN format.  If `a' is a signaling NaN, the
| invalid exception is raised.
*----------------------------------------------------------------------------*}

function floatx80ToCommonNaN(a : floatx80): commonNaNT;
var
    z: commonNaNT;
begin
    if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
    z.sign := a.high shr 15;
    z.low := 0;
    z.high := a.low shl 1;
    result := z;
end;

{*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is a
| NaN; otherwise returns 0.
*----------------------------------------------------------------------------*}

function floatx80_is_nan(a : floatx80 ): flag;
begin
    result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
end;

{*----------------------------------------------------------------------------
| Takes two extended double-precision floating-point values `a' and `b', one
| of which is a NaN, and returns the appropriate NaN result.  If either `a' or
| `b' is a signaling NaN, the invalid exception is raised.
*----------------------------------------------------------------------------*}

function propagateFloatx80NaN(a, b: floatx80): floatx80;
var
    aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
label
    returnLargerSignificand;
begin
    aIsNaN := floatx80_is_nan( a );
    aIsSignalingNaN := floatx80_is_signaling_nan( a );
    bIsNaN := floatx80_is_nan( b );
    bIsSignalingNaN := floatx80_is_signaling_nan( b );
    a.low := a.low or $C000000000000000;
    b.low := b.low or $C000000000000000;
    if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
    if aIsSignalingNaN <> 0 then begin
        if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
        if bIsNaN <> 0 then result := b else result := a;
        exit;
    end
    else if aIsNaN <>0 then begin
        if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
            result := a;
            exit;
        end;
 returnLargerSignificand:
        if ( a.low < b.low ) then begin
            result := b;
            exit;
        end;
        if ( b.low < a.low ) then begin
            result := a;
            exit;
        end;
        if a.high < b.high then result := a else result := b;
        exit;
    end
    else
        result := b;
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_to_float32(a: floatx80): float32;
var
    aSign: flag;
    aExp: int32;
    aSig: bits64;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    if ( aExp = $7FFF ) then begin
        if bits64( aSig shl 1 ) <> 0 then begin
            result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
            exit;
        end;
        result := packFloat32( aSign, $FF, 0 );
        exit;
    end;
    shift64RightJamming( aSig, 33, aSig );
    if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
    result := roundAndPackFloat32( aSign, aExp, aSig );

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_to_float64(a: floatx80): float64;
var
    aSign: flag;
    aExp: int32;
    aSig, zSig: bits64;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    if ( aExp = $7FFF ) then begin
        if bits64( aSig shl 1 ) <> 0 then begin
            result:=commonNaNToFloat64(floatx80ToCommonNaN(a));
            exit;
        end;
        result := packFloat64( aSign, $7FF, 0 );
        exit;
    end;
    shift64RightJamming( aSig, 1, zSig );
    if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
    result := roundAndPackFloat64( aSign, aExp, zSig );

end;

{$ifdef FPC_SOFTFLOAT_FLOAT128}

{*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_to_float128(a: floatx80): float128;
var
    aSign: flag;
    aExp: int16;
    aSig, zSig0, zSig1: bits64;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
        result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
        exit;
    end;
    shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
    result := packFloat128( aSign, aExp, zSig0, zSig1 );

end;

{$endif FPC_SOFTFLOAT_FLOAT128}

{*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and Returns the result as an extended quadruple-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_round_to_int(a: floatx80): floatx80;
var
    aSign: flag;
    aExp: int32;
    lastBitMask, roundBitsMask: bits64;
    roundingMode: TFPURoundingMode;
    z: floatx80;
begin
    aExp := extractFloatx80Exp( a );
    if ( $403E <= aExp ) then begin
        if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, a );
            exit;
        end;
        result := a;
        exit;
    end;
    if ( aExp < $3FFF ) then begin
        if (    ( aExp = 0 )
             and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
            result := a;
            exit;
        end;
        set_inexact_flag;
        aSign := extractFloatx80Sign( a );
        case softfloat_rounding_mode of
         float_round_nearest_even:
            if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
               ) then begin
                result :=
                    packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
                exit;
            end;
         float_round_down: begin
            if aSign <> 0 then
                result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
            else
                result := packFloatx80( 0, 0, 0 );
            exit;
            end;
         float_round_up: begin
            if aSign <> 0 then
                result := packFloatx80( 1, 0, 0 )
            else
                result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
            exit;
            end;
        end;
        result := packFloatx80( aSign, 0, 0 );
        exit;
    end;
    lastBitMask := 1;
    lastBitMask := lastBitMask shl ( $403E - aExp );
    roundBitsMask := lastBitMask - 1;
    z := a;
    roundingMode := softfloat_rounding_mode;
    if ( roundingMode = float_round_nearest_even ) then begin
        inc( z.low, lastBitMask shr 1 );
        if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
    end
    else if ( roundingMode <> float_round_to_zero ) then begin
        if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
            inc( z.low, roundBitsMask );
        end;
    end;
    z.low := z.low and not roundBitsMask;
    if ( z.low = 0 ) then begin
        inc(z.high);
        z.low := bits64( $8000000000000000 );
    end;
    if ( z.low <> a.low ) then set_inexact_flag;
    result := z;

end;

{*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
var
    aExp, bExp, zExp: int32;
    aSig, bSig, zSig0, zSig1: bits64;
    expDiff: int32;
label
    shiftRight1, roundAndPack;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    bSig := extractFloatx80Frac( b );
    bExp := extractFloatx80Exp( b );
    expDiff := aExp - bExp;
    if ( 0 < expDiff ) then begin
        if ( aExp = $7FFF ) then begin
            if ( bits64( aSig shl 1 ) <> 0 ) then begin
                result := propagateFloatx80NaN( a, b );
                exit;
            end;
            result := a;
            exit;
        end;
        if ( bExp = 0 ) then dec(expDiff);
        shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
        zExp := aExp;
    end
    else if ( expDiff < 0 ) then begin
        if ( bExp = $7FFF ) then begin
            if ( bits64( bSig shl 1 ) <> 0 ) then begin
                result := propagateFloatx80NaN( a, b );
                exit;
            end;
            result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
            exit;
        end;
        if ( aExp = 0 ) then inc(expDiff);
        shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
        zExp := bExp;
    end
    else begin
        if ( aExp = $7FFF ) then begin
            if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
                result := propagateFloatx80NaN( a, b );
                exit;
            end;
            result := a;
            exit;
        end;
        zSig1 := 0;
        zSig0 := aSig + bSig;
        if ( aExp = 0 ) then begin
            normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
            goto roundAndPack;
        end;
        zExp := aExp;
        goto shiftRight1;
    end;
    zSig0 := aSig + bSig;
    if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
 shiftRight1:
    shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
    zSig0 := zSig0 or $8000000000000000;
    inc(zExp);
 roundAndPack:
    result :=
        roundAndPackFloatx80(
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
var
    aExp, bExp, zExp: int32;
    aSig, bSig, zSig0, zSig1: bits64;
    expDiff: int32;
    z: floatx80;
label
    bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    bSig := extractFloatx80Frac( b );
    bExp := extractFloatx80Exp( b );
    expDiff := aExp - bExp;
    if ( 0 < expDiff ) then goto aExpBigger;
    if ( expDiff < 0 ) then goto bExpBigger;
    if ( aExp = $7FFF ) then begin
        if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        float_raise( float_flag_invalid );
        z.low := floatx80_default_nan_low;
        z.high := floatx80_default_nan_high;
        result := z;
        exit;
    end;
    if ( aExp = 0 ) then begin
        aExp := 1;
        bExp := 1;
    end;
    zSig1 := 0;
    if ( bSig < aSig ) then goto aBigger;
    if ( aSig < bSig ) then goto bBigger;
    result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
    exit;
 bExpBigger:
    if ( bExp = $7FFF ) then begin
        if ( bits64( bSig shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
        exit;
    end;
    if ( aExp = 0 ) then inc(expDiff);
    shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
    zExp := bExp;
    zSign := zSign xor  1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp = $7FFF ) then begin
        if ( bits64( aSig shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        result := a;
        exit;
    end;
    if ( bExp = 0 ) then dec(expDiff);
    shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
    zExp := aExp;
 normalizeRoundAndPack:
    result :=
        normalizeRoundAndPackFloatx80(
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the result of adding the extended double-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_add(a: floatx80; b: floatx80): floatx80;
var
    aSign, bSign: flag;
begin
    aSign := extractFloatx80Sign( a );
    bSign := extractFloatx80Sign( b );
    if ( aSign = bSign ) then begin
        result := addFloatx80Sigs( a, b, aSign );
    end
    else begin
        result := subFloatx80Sigs( a, b, aSign );
    end;

end;

{*----------------------------------------------------------------------------
| Returns the result of subtracting the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
var
    aSign, bSign: flag;
begin
    aSign := extractFloatx80Sign( a );
    bSign := extractFloatx80Sign( b );
    if ( aSign = bSign ) then begin
        result := subFloatx80Sigs( a, b, aSign );
    end
    else begin
        result := addFloatx80Sigs( a, b, aSign );
    end;

end;

{*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_mul(a: floatx80; b: floatx80): floatx80;
var
    aSign, bSign, zSign: flag;
    aExp, bExp, zExp: int32;
    aSig, bSig, zSig0, zSig1: bits64;
    z: floatx80;
label
    invalid;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    bSig := extractFloatx80Frac( b );
    bExp := extractFloatx80Exp( b );
    bSign := extractFloatx80Sign( b );
    zSign := aSign xor bSign;
    if ( aExp = $7FFF ) then begin
        if ( bits64( aSig shl 1 ) <> 0 )
             or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        if ( ( bExp or bSig ) = 0 ) then goto invalid;
        result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
        exit;
    end;
    if ( bExp = $7FFF ) then begin
        if ( bits64( bSig shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        if ( ( aExp or aSig ) = 0 ) then begin
 invalid:
            float_raise( float_flag_invalid );
            z.low := floatx80_default_nan_low;
            z.high := floatx80_default_nan_high;
            result := z;
            exit;
        end;
        result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( aSig = 0 ) then begin
            result := packFloatx80( zSign, 0, 0 );
            exit;
        end;
        normalizeFloatx80Subnormal( aSig, aExp, aSig );
    end;
    if ( bExp = 0 ) then begin
        if ( bSig = 0 ) then begin
            result := packFloatx80( zSign, 0, 0 );
            exit;
        end;
        normalizeFloatx80Subnormal( bSig, bExp, bSig );
    end;
    zExp := aExp + bExp - $3FFE;
    mul64To128( aSig, bSig, zSig0, zSig1 );
    if 0 < sbits64( zSig0 ) then begin
        shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
        dec(zExp);
    end;
    result :=
        roundAndPackFloatx80(
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
var
    aSign, bSign, zSign: flag;
    aExp, bExp, zExp: int32;
    aSig, bSig, zSig0, zSig1: bits64;
    rem0, rem1, rem2, term0, term1, term2: bits64;
    z: floatx80;
label
    invalid;
begin
    aSig := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    bSig := extractFloatx80Frac( b );
    bExp := extractFloatx80Exp( b );
    bSign := extractFloatx80Sign( b );
    zSign := aSign xor bSign;
    if ( aExp = $7FFF ) then begin
        if ( bits64( aSig shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        if ( bExp = $7FFF ) then begin
            if ( bits64( bSig shl 1 ) <> 0 ) then begin
                result := propagateFloatx80NaN( a, b );
                exit;
            end;
            goto invalid;
        end;
        result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
        exit;
    end;
    if ( bExp = $7FFF ) then begin
        if ( bits64( bSig shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        result := packFloatx80( zSign, 0, 0 );
        exit;
    end;
    if ( bExp = 0 ) then begin
        if ( bSig = 0 ) then begin
            if ( ( aExp or aSig ) = 0 ) then begin
 invalid:
                float_raise( float_flag_invalid );
                z.low := floatx80_default_nan_low;
                z.high := floatx80_default_nan_high;
                result := z;
                exit;
            end;
            float_raise( float_flag_divbyzero );
            result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
            exit;
        end;
        normalizeFloatx80Subnormal( bSig, bExp, bSig );
    end;
    if ( aExp = 0 ) then begin
        if ( aSig = 0 ) then begin
            result := packFloatx80( zSign, 0, 0 );
            exit;
        end;
        normalizeFloatx80Subnormal( aSig, aExp, aSig );
    end;
    zExp := aExp - bExp + $3FFE;
    rem1 := 0;
    if ( bSig <= aSig ) then begin
        shift128Right( aSig, 0, 1, aSig, rem1 );
        inc(zExp);
    end;
    zSig0 := estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, term0, term1 );
    sub128( aSig, rem1, term0, term1, rem0, rem1 );
    while ( sbits64( rem0 ) < 0 ) do begin
        dec(zSig0);
        add128( rem0, rem1, 0, bSig, rem0, rem1 );
    end;
    zSig1 := estimateDiv128To64( rem1, 0, bSig );
    if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
        mul64To128( bSig, zSig1, term1, term2 );
        sub128( rem1, 0, term1, term2, rem1, rem2 );
        while ( sbits64( rem1 ) < 0 ) do begin
            dec(zSig1);
            add128( rem1, rem2, 0, bSig, rem1, rem2 );
        end;
        zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
    end;
    result :=
        roundAndPackFloatx80(
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
var
    aSign, zSign: flag;
    aExp, bExp, expDiff: int32;
    aSig0, aSig1, bSig: bits64;
    q, term0, term1, alternateASig0, alternateASig1: bits64;
    z: floatx80;
label
    invalid;
begin
    aSig0 := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    bSig := extractFloatx80Frac( b );
    bExp := extractFloatx80Exp( b );
    if ( aExp = $7FFF ) then begin
        if ( bits64( aSig0 shl 1 ) <> 0 )
             or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        goto invalid;
    end;
    if ( bExp = $7FFF ) then begin
        if ( bits64( bSig shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, b );
            exit;
        end;
        result := a;
        exit;
    end;
    if ( bExp = 0 ) then begin
        if ( bSig = 0 ) then begin
 invalid:
            float_raise( float_flag_invalid );
            z.low := floatx80_default_nan_low;
            z.high := floatx80_default_nan_high;
            result := z;
            exit;
        end;
        normalizeFloatx80Subnormal( bSig, bExp, bSig );
    end;
    if ( aExp = 0 ) then begin
        if ( bits64( aSig0 shl 1 ) = 0 ) then begin
            result := a;
            exit;
        end;
        normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
    end;
    bSig := bSig or $8000000000000000;
    zSign := aSign;
    expDiff := aExp - bExp;
    aSig1 := 0;
    if ( expDiff < 0 ) then begin
        if ( expDiff < -1 ) then begin
            result := a;
            exit;
        end;
        shift128Right( aSig0, 0, 1, aSig0, aSig1 );
        expDiff := 0;
    end;
    q := ord( bSig <= aSig0 );
    if ( q <> 0 ) then dec( aSig0, bSig );
    dec( expDiff, 64 );
    while ( 0 < expDiff ) do begin
        q := estimateDiv128To64( aSig0, aSig1, bSig );
        if ( 2 < q ) then q := q - 2 else q := 0;
        mul64To128( bSig, q, term0, term1 );
        sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
        shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
        dec( expDiff, 62 );
    end;
    inc( expDiff, 64 );
    if ( 0 < expDiff ) then begin
        q := estimateDiv128To64( aSig0, aSig1, bSig );
        if ( 2 < q ) then q:= q - 2 else q := 0;
        q := q shr ( 64 - expDiff );
        mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
        sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
            inc(q);
            sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
        end;
    end
    else begin
        term1 := 0;
        term0 := bSig;
    end;
    sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
    if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
         or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
              and ( q and 1 <> 0 ) )
       then begin
        aSig0 := alternateASig0;
        aSig1 := alternateASig1;
        zSign := ord( zSign = 0 );
    end;
    result :=
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_sqrt(a: floatx80): floatx80;
var
    aSign: flag;
    aExp, zExp: int32;
    aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
    rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
    z: floatx80;
label
    invalid;
begin
    aSig0 := extractFloatx80Frac( a );
    aExp := extractFloatx80Exp( a );
    aSign := extractFloatx80Sign( a );
    if ( aExp = $7FFF ) then begin
        if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
            result := propagateFloatx80NaN( a, a );
            exit;
        end;
        if ( aSign = 0 ) then begin
            result := a;
            exit;
        end;
        goto invalid;
    end;
    if ( aSign <> 0 ) then begin
        if ( ( aExp or aSig0 ) = 0 ) then begin
            result := a;
            exit;
        end;
 invalid:
        float_raise( float_flag_invalid );
        z.low := floatx80_default_nan_low;
        z.high := floatx80_default_nan_high;
        result := z;
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( aSig0 = 0 ) then begin
            result := packFloatx80( 0, 0, 0 );
            exit;
        end;
        normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
    end;
    zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
    zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
    shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
    zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
    doubleZSig0 := zSig0 shl 1;
    mul64To128( zSig0, zSig0, term0, term1 );
    sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
    while ( sbits64( rem0 ) < 0 ) do begin
        dec(zSig0);
        dec( doubleZSig0, 2 );
        add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
    end;
    zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
        if ( zSig1 = 0 ) then zSig1 := 1;
        mul64To128( doubleZSig0, zSig1, term1, term2 );
        sub128( rem1, 0, term1, term2, rem1, rem2 );
        mul64To128( zSig1, zSig1, term2, term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
        while ( sbits64( rem1 ) < 0 ) do begin
            dec(zSig1);
            shortShift128Left( 0, zSig1, 1, term2, term3 );
            term3 := term3 or 1;
            term2 := term2 or doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
        end;
        zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
    end;
    shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
    zSig0 := zSig0 or doubleZSig0;
    result :=
        roundAndPackFloatx80(
            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );

end;

{*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_eq(a: floatx80; b: floatx80 ): flag;
begin
    if      (    ( extractFloatx80Exp( a ) = $7FFF )
              and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
       ) or (    ( extractFloatx80Exp( b ) = $7FFF )
              and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
       ) then begin
        if (    floatx80_is_signaling_nan( a )
             or floatx80_is_signaling_nan( b ) <> 0 ) then begin
            float_raise( float_flag_invalid );
        end;
        result := 0;
        exit;
    end;
    result := ord(
           ( a.low = b.low )
        and (    ( a.high = b.high )
             or (    ( a.low = 0 )
                  and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
           ) );
end;

{*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise.  The
| comparison is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_le(a: floatx80; b: floatx80 ): flag;
var
    aSign, bSign: flag;
begin
    if      (     ( extractFloatx80Exp( a ) = $7FFF )
              and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
         or (     ( extractFloatx80Exp( b ) = $7FFF )
              and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
    then begin
        float_raise( float_flag_invalid );
        result := 0;
        exit;
    end;
    aSign := extractFloatx80Sign( a );
    bSign := extractFloatx80Sign( b );
    if ( aSign <> bSign ) then begin
        result := ord(
               ( aSign <> 0 )
            or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
        exit;
    end;
    if aSign<>0 then
        result := le128( b.high, b.low, a.high, a.low )
    else
        result := le128( a.high, a.low, b.high, b.low );
end;

{*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than the corresponding value `b', and 0 otherwise.  The comparison
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_lt(a: floatx80; b: floatx80 ): flag;
var
    aSign, bSign: flag;
begin
    if      (     ( extractFloatx80Exp( a ) = $7FFF )
              and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
         or (     ( extractFloatx80Exp( b ) = $7FFF )
              and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
    then begin
        float_raise( float_flag_invalid );
        result := 0;
        exit;
    end;
    aSign := extractFloatx80Sign( a );
    bSign := extractFloatx80Sign( b );
    if ( aSign <> bSign ) then begin
        result := ord(
                ( aSign <> 0 )
            and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
        exit;
    end;
    if aSign <> 0 then
        result := lt128( b.high, b.low, a.high, a.low )
    else
        result := lt128( a.high, a.low, b.high, b.low );
end;

{*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is equal
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
begin
    if      (     ( extractFloatx80Exp( a ) = $7FFF )
              and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
         or (     ( extractFloatx80Exp( b ) = $7FFF )
              and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
    then begin
        float_raise( float_flag_invalid );
        result := 0;
        exit;
    end;
    result := ord(
           ( a.low = b.low )
        and (    ( a.high = b.high )
             or (    ( a.low = 0 )
                  and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
           )     );
end;

{*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
| do not cause an exception.  Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
var
    aSign, bSign: flag;
begin
    if      (    ( extractFloatx80Exp( a ) = $7FFF )
             and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
         or (    ( extractFloatx80Exp( b ) = $7FFF )
             and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
    then begin
        if (    floatx80_is_signaling_nan( a )
             or floatx80_is_signaling_nan( b ) <> 0 ) then begin
            float_raise( float_flag_invalid );
        end;
        result := 0;
        exit;
    end;
    aSign := extractFloatx80Sign( a );
    bSign := extractFloatx80Sign( b );
    if ( aSign <> bSign ) then begin
        result := ord(
               ( aSign <> 0 )
            or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
        exit;
    end;
    if aSign <> 0 then
        result := le128( b.high, b.low, a.high, a.low )
    else
        result := le128( a.high, a.low, b.high, b.low );
end;

{*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
| an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
var
    aSign, bSign: flag;
begin
    if      (    ( extractFloatx80Exp( a ) = $7FFF )
             and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
         or (    ( extractFloatx80Exp( b ) = $7FFF )
             and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
    then begin
        if (    floatx80_is_signaling_nan( a )
             or floatx80_is_signaling_nan( b ) <> 0 ) then begin
            float_raise( float_flag_invalid );
        end;
        result := 0;
        exit;
    end;
    aSign := extractFloatx80Sign( a );
    bSign := extractFloatx80Sign( b );
    if ( aSign <> bSign ) then begin
        result := ord(
               ( aSign <> 0 )
            and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
        exit;
    end;
    if aSign <> 0 then
        result := lt128( b.high, b.low, a.high, a.low )
    else
        result := lt128( a.high, a.low, b.high, b.low );
end;

{$endif FPC_SOFTFLOAT_FLOATX80}


{$ifdef FPC_SOFTFLOAT_FLOAT128}

{*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*}

function extractFloat128Frac1(a : float128): bits64;
begin
    result:=a.low;
end;

{*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*}

function extractFloat128Frac0(a : float128): bits64;
begin
    result:=a.high and int64($0000FFFFFFFFFFFF);
end;

{*----------------------------------------------------------------------------
| Returns the exponent bits of the quadruple-precision floating-point value
| `a'.
*----------------------------------------------------------------------------*}

function extractFloat128Exp(a : float128): int32;
begin
    result:=( a.high shr 48 ) and $7FFF;
end;

{*----------------------------------------------------------------------------
| Returns the sign bit of the quadruple-precision floating-point value `a'.
*----------------------------------------------------------------------------*}

function extractFloat128Sign(a : float128): flag;
begin
    result:=a.high shr 63;
end;

{*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
| significand are stored at the location pointed to by `zSig0Ptr', and the
| least significant 64 bits of the normalized significand are stored at the
| location pointed to by `zSig1Ptr'.
*----------------------------------------------------------------------------*}

procedure normalizeFloat128Subnormal(
     aSig0: bits64;
     aSig1: bits64;
     var zExpPtr: int32;
     var zSig0Ptr: bits64;
     var zSig1Ptr: bits64);
var
    shiftCount: int8;
begin
    if ( aSig0 = 0 ) then
    begin
        shiftCount := countLeadingZeros64( aSig1 ) - 15;
        if ( shiftCount < 0 ) then
        begin
            zSig0Ptr := aSig1 shr ( - shiftCount );
            zSig1Ptr := aSig1 shl ( shiftCount and 63 );
        end
        else begin
            zSig0Ptr := aSig1 shl shiftCount;
            zSig1Ptr := 0;
        end;
        zExpPtr := - shiftCount - 63;
    end
    else begin
        shiftCount := countLeadingZeros64( aSig0 ) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        zExpPtr := 1 - shiftCount;
    end;

end;

{*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result.  After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 32 bits of the result.  This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
*----------------------------------------------------------------------------*}

function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
var
    z: float128;
begin
    z.low := zSig1;
    z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
    result:=z;
end;

{*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
var
    roundingMode: TFPURoundingMode;
    roundNearestEven, increment, isTiny: flag;
begin
    roundingMode := softfloat_rounding_mode;
    roundNearestEven := ord( roundingMode = float_round_nearest_even );
    increment := ord( sbits64(zSig2) < 0 );
    if ( roundNearestEven=0 ) then
    begin
        if ( roundingMode = float_round_to_zero ) then
        begin
            increment := 0;
        end
        else begin
            if ( zSign<>0 ) then
            begin
                increment := ord( roundingMode = float_round_down ) and zSig2;
            end
            else begin
                increment := ord( roundingMode = float_round_up ) and zSig2;
            end;
        end;
    end;
    if ( $7FFD <= bits32(zExp) ) then
    begin
        if (    ord( $7FFD < zExp )
             or (    ord( zExp = $7FFD )
                  and eq128(
                         int64( $0001FFFFFFFFFFFF ),
                         bits64( $FFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  and increment
                )
           )<>0 then
           begin
            float_raise( [float_flag_overflow,float_flag_inexact] );
            if (    ord( roundingMode = float_round_to_zero )
                 or ( zSign and ord( roundingMode = float_round_up ) )
                 or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
               )<>0 then
               begin
                result :=
                    packFloat128(
                        zSign,
                        $7FFE,
                        int64( $0000FFFFFFFFFFFF ),
                        bits64( $FFFFFFFFFFFFFFFF )
                    );
                exit;
            end;
            result:=packFloat128( zSign, $7FFF, 0, 0 );
            exit;
        end;
        if ( zExp < 0 ) then
        begin
            isTiny :=
                   ord(( softfloat_detect_tininess = float_tininess_before_rounding )
                or ( zExp < -1 )
                or not( increment<>0 )
                or boolean(lt128(
                       zSig0,
                       zSig1,
                       int64( $0001FFFFFFFFFFFF ),
                       bits64( $FFFFFFFFFFFFFFFF )
                   )));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
            zExp := 0;
            if ( isTiny and zSig2 )<>0 then
              float_raise( float_flag_underflow );
            if ( roundNearestEven<>0 ) then
            begin
                increment := ord( sbits64(zSig2) < 0 );
            end
            else begin
                if ( zSign<>0 ) then
                begin
                    increment := ord( roundingMode = float_round_down ) and zSig2;
                end
                else begin
                    increment := ord( roundingMode = float_round_up ) and zSig2;
                end;
            end;
        end;
    end;
    if ( zSig2<>0 ) then
      set_inexact_flag;
    if ( increment<>0 ) then
    begin
        add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
        zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
    end
    else begin
        if ( ( zSig0 or zSig1 ) = 0 ) then
          zExp := 0;
    end;
    result:=packFloat128( zSign, zExp, zSig0, zSig1 );
end;

{*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
*----------------------------------------------------------------------------*}

function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
var
    shiftCount: int8;
    zSig2: bits64;
begin
    if ( zSig0 = 0 ) then
    begin
        zSig0 := zSig1;
        zSig1 := 0;
        dec(zExp, 64);
    end;
    shiftCount := countLeadingZeros64( zSig0 ) - 15;
    if ( 0 <= shiftCount ) then
    begin
        zSig2 := 0;
        shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
    end
    else begin
        shift128ExtraRightJamming(
            zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
    end;
    dec(zExp, shiftCount);
    result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*}

function float128_to_int32(a: float128): int32;
var
    aSign: flag;
    aExp, shiftCount: int32;
    aSig0, aSig1: bits64;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
      aSign := 0;
    if ( aExp<>0 ) then
      aSig0 := aSig0 or int64( $0001000000000000 );
    aSig0 := aSig0 or ord( aSig1 <> 0 );
    shiftCount := $4028 - aExp;
    if ( 0 < shiftCount ) then
      shift64RightJamming( aSig0, shiftCount, aSig0 );
    result := roundAndPackInt32( aSign, aSig0 );

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*}

function float128_to_int32_round_to_zero(a: float128): int32;
var
    aSign: flag;
    aExp, shiftCount: int32;
    aSig0, aSig1, savedASig: bits64;
    z: int32;
label
    invalid;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    aSig0 := aSig0 or ord( aSig1 <> 0 );
    if ( $401E < aExp ) then
    begin
        if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
          aSign := 0;
        goto invalid;
    end
    else if ( aExp < $3FFF ) then
    begin
        if ( aExp or aSig0 )<>0 then
          set_inexact_flag;
        result := 0;
        exit;
    end;
    aSig0 := aSig0 or int64( $0001000000000000 );
    shiftCount := $402F - aExp;
    savedASig := aSig0;
    aSig0 := aSig0 shr shiftCount;
    z := aSig0;
    if ( aSign )<>0 then
      z := - z;
    if ( ord( z < 0 ) xor aSign )<>0 then
    begin
 invalid:
        float_raise( float_flag_invalid );
        if aSign<>0 then
          result:= int32( $80000000 )
        else
          result:=$7FFFFFFF;
        exit;
    end;
    if ( ( aSig0 shl shiftCount ) <> savedASig ) then
    begin
        set_inexact_flag;
    end;
    result := z;
end;

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*}

function float128_to_int64(a: float128): int64;
var
    aSign: flag;
    aExp, shiftCount: int32;
    aSig0, aSig1: bits64;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    if ( aExp<>0 ) then
      aSig0 := aSig0 or int64( $0001000000000000 );
    shiftCount := $402F - aExp;
    if ( shiftCount <= 0 ) then
    begin
        if ( $403E < aExp ) then
        begin
            float_raise( float_flag_invalid );
            if (    (aSign=0)
                 or (    ( aExp = $7FFF )
                      and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
                    )
               ) then
               begin
                result := int64( $7FFFFFFFFFFFFFFF );
                exit;
            end;
            result := int64( $8000000000000000 );
            exit;
        end;
        shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
    end
    else begin
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
    end;
    result := roundAndPackInt64( aSign, aSig0, aSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*}

function float128_to_int64_round_to_zero(a: float128): int64;
var
    aSign: flag;
    aExp, shiftCount: int32;
    aSig0, aSig1: bits64;
    z: int64;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    if ( aExp<>0 ) then
      aSig0 := aSig0 or int64( $0001000000000000 );
    shiftCount := aExp - $402F;
    if ( 0 < shiftCount ) then
    begin
        if ( $403E <= aExp ) then
        begin
            aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
            if (    ( a.high = bits64( $C03E000000000000 ) )
                 and ( aSig1 < int64( $0002000000000000 ) ) ) then
            begin
                if ( aSig1<>0 ) then
                  set_inexact_flag;
            end
            else begin
                float_raise( float_flag_invalid );
                if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
                begin
                    result := int64( $7FFFFFFFFFFFFFFF );
                    exit;
                end;
            end;
            result := int64( $8000000000000000 );
            exit;
        end;
        z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
        if ( int64( aSig1 shl shiftCount )<>0 ) then
        begin
            set_inexact_flag;
        end;
    end
    else begin
        if ( aExp < $3FFF ) then
        begin
            if ( aExp or aSig0 or aSig1 )<>0 then
            begin
                set_inexact_flag;
            end;
            result := 0;
            exit;
        end;
        z := aSig0 shr ( - shiftCount );
        if (    (aSig1<>0)
             or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
        begin
            set_inexact_flag;
        end;
    end;
    if ( aSign<>0 ) then
      z := - z;
    result := z;

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the single-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function float128_to_float32(a: float128): float32;
var
    aSign: flag;
    aExp: int32;
    aSig0, aSig1: bits64;
    zSig: bits32;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    if ( aExp = $7FFF ) then
    begin
        if ( aSig0 or aSig1 )<>0 then
        begin
            result := commonNaNToFloat32( float128ToCommonNaN( a ) );
            exit;
        end;
        result := packFloat32( aSign, $FF, 0 );
        exit;
    end;
    aSig0 := aSig0 or ord( aSig1 <> 0 );
    shift64RightJamming( aSig0, 18, aSig0 );
    zSig := aSig0;
    if ( aExp<>0 ) or (aSig0 <> 0 ) then
    begin
        zSig := zSig or $40000000;
        dec(aExp,$3F81);
    end;
    result := roundAndPackFloat32( aSign, aExp, zSig );

end;

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function float128_to_float64(a: float128): float64;
var
    aSign: flag;
    aExp: int32;
    aSig0, aSig1: bits64;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    if ( aExp = $7FFF ) then
    begin
        if ( aSig0 or aSig1 )<>0 then
        begin
            result:=commonNaNToFloat64(float128ToCommonNaN(a));
            exit;
        end;
        result:=packFloat64( aSign, $7FF, 0);
        exit;
    end;
    shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
    aSig0 := aSig0 or ord( aSig1 <> 0 );
    if ( aExp<>0 ) or (aSig0 <> 0 ) then
    begin
        aSig0 := aSig0 or int64( $4000000000000000 );
        dec(aExp,$3C01);
    end;
    result := roundAndPackFloat64( aSign, aExp, aSig0 );
end;

{$ifdef FPC_SOFTFLOAT_FLOATX80}

{*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_to_floatx80(a: float128): floatx80;
var
    aSign: flag;
    aExp: int32;
    aSig0, aSig1: bits64;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    if ( aExp = $7FFF ) then begin
        if ( aSig0 or aSig1 <> 0 ) then begin
            result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
            exit;
        end;
        result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( ( aSig0 or aSig1 ) = 0 ) then
          begin
            result := packFloatx80( aSign, 0, 0 );
            exit;
          end;
        normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    end
    else begin
        aSig0 := aSig0 or int64( $0001000000000000 );
    end;
    shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
    result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );

end;

{$endif FPC_SOFTFLOAT_FLOATX80}

{*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| Returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_round_to_int(a: float128): float128;
var
    aSign: flag;
    aExp: int32;
    lastBitMask, roundBitsMask: bits64;
    roundingMode: TFPURoundingMode;
    z: float128;
begin
    aExp := extractFloat128Exp( a );
    if ( $402F <= aExp ) then
    begin
        if ( $406F <= aExp ) then
        begin
            if (    ( aExp = $7FFF )
                 and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
               ) then
               begin
                result := propagateFloat128NaN( a, a );
                exit;
            end;
            result := a;
            exit;
        end;
        lastBitMask := 1;
        lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
        roundBitsMask := lastBitMask - 1;
        z := a;
        roundingMode := softfloat_rounding_mode;
        if ( roundingMode = float_round_nearest_even ) then
        begin
            if ( lastBitMask )<>0 then
            begin
                add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
                if ( ( z.low and roundBitsMask ) = 0 ) then
                  z.low := z.low and not(lastBitMask);
            end
            else begin
                if ( sbits64(z.low) < 0 ) then
                begin
                    inc(z.high);
                    if ( bits64( z.low shl 1 ) = 0 ) then
                      z.high := z.high and not bits64( 1 );
                end;
            end;
        end
        else if ( roundingMode <> float_round_to_zero ) then
        begin
            if (   extractFloat128Sign( z )
                 xor ord( roundingMode = float_round_up ) )<>0 then
            begin
                add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
            end;
        end;
        z.low := z.low and not(roundBitsMask);
    end
    else begin
        if ( aExp < $3FFF ) then
        begin
            if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
              begin
                result := a;
                exit;
              end;
            set_inexact_flag;
            aSign := extractFloat128Sign( a );
            case softfloat_rounding_mode of
            float_round_nearest_even:
                if (    ( aExp = $3FFE )
                     and (   (extractFloat128Frac0( a )<>0)
                          or (extractFloat128Frac1( a )<>0) )
                   ) then begin
                   begin
                     result := packFloat128( aSign, $3FFF, 0, 0 );
                     exit;
                   end;
                end;
             float_round_down:
               begin
                 if aSign<>0 then
                   result:=packFloat128( 1, $3FFF, 0, 0 )
                 else
                   result:=packFloat128( 0, 0, 0, 0 );
                 exit;
               end;
             float_round_up:
               begin
                 if aSign<>0 then
                   result := packFloat128( 1, 0, 0, 0 )
                 else
                   result:=packFloat128( 0, $3FFF, 0, 0 );
                exit;
               end;
            end;
            result := packFloat128( aSign, 0, 0, 0 );
            exit;
        end;
        lastBitMask := 1;
        lastBitMask  := lastBitMask shl ($402F - aExp);
        roundBitsMask := lastBitMask - 1;
        z.low := 0;
        z.high := a.high;
        roundingMode := softfloat_rounding_mode;
        if ( roundingMode = float_round_nearest_even ) then begin
            inc(z.high,lastBitMask shr 1);
            if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
                z.high := z.high and not(lastBitMask);
            end;
        end
        else if ( roundingMode <> float_round_to_zero ) then begin
            if (   (extractFloat128Sign( z )<>0)
                 xor ( roundingMode = float_round_up ) ) then begin
                z.high := z.high or ord( a.low <> 0 );
                z.high := z.high+roundBitsMask;
            end;
        end;
        z.high := z.high and not(roundBitsMask);
    end;
    if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
        set_inexact_flag;
    end;
    result := z;

end;

{*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
var
    aExp, bExp, zExp: int32;
    aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
    expDiff: int32;
label
    shiftRight1,roundAndPack;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    bSig1 := extractFloat128Frac1( b );
    bSig0 := extractFloat128Frac0( b );
    bExp := extractFloat128Exp( b );
    expDiff := aExp - bExp;
    if ( 0 < expDiff ) then begin
        if ( aExp = $7FFF ) then begin
            if ( aSig0 or aSig1 )<>0 then
              begin
                result := propagateFloat128NaN( a, b );
                exit;
              end;
            result := a;
            exit;
        end;
        if ( bExp = 0 ) then begin
            dec(expDiff);
        end
        else begin
            bSig0 := bSig0 or int64( $0001000000000000 );
        end;
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
        zExp := aExp;
    end
    else if ( expDiff < 0 ) then begin
        if ( bExp = $7FFF ) then begin
            if ( bSig0 or bSig1 )<>0 then
              begin
                result := propagateFloat128NaN( a, b );
                exit;
              end;
            result := packFloat128( zSign, $7FFF, 0, 0 );
            exit;
        end;
        if ( aExp = 0 ) then begin
            inc(expDiff);
        end
        else begin
            aSig0 := aSig0 or int64( $0001000000000000 );
        end;
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
        zExp := bExp;
    end
    else begin
        if ( aExp = $7FFF ) then begin
            if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
                result := propagateFloat128NaN( a, b );
                exit;
            end;
            result := a;
            exit;
        end;
        add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
        if ( aExp = 0 ) then
          begin
            result := packFloat128( zSign, 0, zSig0, zSig1 );
            exit;
          end;
        zSig2 := 0;
        zSig0 := zSig0 or int64( $0002000000000000 );
        zExp := aExp;
        goto shiftRight1;
    end;
    aSig0 := aSig0 or int64( $0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
    dec(zExp);
    if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
    inc(zExp);
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
 roundAndPack:
    result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );

end;

{*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function subFloat128Sigs( a, b : float128;  zSign : flag): float128;
var
    aExp, bExp, zExp: int32;
    aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
    expDiff: int32;
    z: float128;
label
    aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    bSig1 := extractFloat128Frac1( b );
    bSig0 := extractFloat128Frac0( b );
    bExp := extractFloat128Exp( b );
    expDiff := aExp - bExp;
    shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
    shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
    if ( 0 < expDiff ) then goto aExpBigger;
    if ( expDiff < 0 ) then goto bExpBigger;
    if ( aExp = $7FFF ) then begin
        if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
            result := propagateFloat128NaN( a, b );
            exit;
        end;
        float_raise( float_flag_invalid );
        z.low := float128_default_nan_low;
        z.high := float128_default_nan_high;
        result := z;
        exit;
    end;
    if ( aExp = 0 ) then begin
        aExp := 1;
        bExp := 1;
    end;
    if ( bSig0 < aSig0 ) then goto aBigger;
    if ( aSig0 < bSig0 ) then goto bBigger;
    if ( bSig1 < aSig1 ) then goto aBigger;
    if ( aSig1 < bSig1 ) then goto bBigger;
    result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
    exit;
 bExpBigger:
    if ( bExp = $7FFF ) then begin
        if ( bSig0 or bSig1 )<>0 then
          begin
            result := propagateFloat128NaN( a, b );
            exit;
          end;

        result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
        exit;
    end;
    if ( aExp = 0 ) then begin
        inc(expDiff);
    end
    else begin
        aSig0 := aSig0 or int64( $4000000000000000 );
    end;
    shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
    bSig0 := bSig0 or int64( $4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
    zExp := bExp;
    zSign := zSign xor 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp = $7FFF ) then begin
        if ( aSig0 or aSig1 )<>0 then
          begin
            result := propagateFloat128NaN( a, b );
            exit;
          end;
        result := a;
        exit;
    end;
    if ( bExp = 0 ) then begin
        dec(expDiff);
    end
    else begin
        bSig0 := bSig0 or int64( $4000000000000000 );
    end;
    shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
    aSig0 := aSig0 or int64( $4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
    zExp := aExp;
 normalizeRoundAndPack:
    dec(zExp);
    result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_add(a: float128; b: float128): float128;
var
    aSign, bSign: flag;
begin
    aSign := extractFloat128Sign( a );
    bSign := extractFloat128Sign( b );
    if ( aSign = bSign ) then begin
        result := addFloat128Sigs( a, b, aSign );
    end
    else begin
        result := subFloat128Sigs( a, b, aSign );
    end;

end;

{*----------------------------------------------------------------------------
| Returns the result of subtracting the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_sub(a: float128; b: float128): float128;
var
    aSign, bSign: flag;
begin
    aSign := extractFloat128Sign( a );
    bSign := extractFloat128Sign( b );
    if ( aSign = bSign ) then begin
        result := subFloat128Sigs( a, b, aSign );
    end
    else begin
        result := addFloat128Sigs( a, b, aSign );
    end;

end;

{*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_mul(a: float128; b: float128): float128;
var
    aSign, bSign, zSign: flag;
    aExp, bExp, zExp: int32;
    aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
    z: float128;
label
    invalid;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    bSig1 := extractFloat128Frac1( b );
    bSig0 := extractFloat128Frac0( b );
    bExp := extractFloat128Exp( b );
    bSign := extractFloat128Sign( b );
    zSign := aSign xor bSign;
    if ( aExp = $7FFF ) then begin
        if (    (( aSig0 or aSig1 )<>0)
             or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
            result := propagateFloat128NaN( a, b );
            exit;
        end;
        if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
        result := packFloat128( zSign, $7FFF, 0, 0 );
        exit;
    end;
    if ( bExp = $7FFF ) then begin
        if ( bSig0 or bSig1 )<>0 then
          begin
            result := propagateFloat128NaN( a, b );
            exit;
          end;
        if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
 invalid:
            float_raise( float_flag_invalid );
            z.low := float128_default_nan_low;
            z.high := float128_default_nan_high;
            result := z;
            exit;
        end;
        result := packFloat128( zSign, $7FFF, 0, 0 );
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( ( aSig0 or aSig1 ) = 0 ) then
          begin
            result := packFloat128( zSign, 0, 0, 0 );
            exit;
          end;
        normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    end;
    if ( bExp = 0 ) then begin
        if ( ( bSig0 or bSig1 ) = 0 ) then
          begin
            result := packFloat128( zSign, 0, 0, 0 );
            exit;
          end;
        normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
    end;
    zExp := aExp + bExp - $4000;
    aSig0 := aSig0 or int64( $0001000000000000 );
    shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
    zSig2 := zSig2 or ord( zSig3 <> 0 );
    if ( int64( $0002000000000000 ) <= zSig0 ) then begin
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
        inc(zExp);
    end;
    result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );

end;

{*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_div(a: float128; b: float128): float128;
var
    aSign, bSign, zSign: flag;
    aExp, bExp, zExp: int32;
    aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
    rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
    z: float128;
label
    invalid;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    bSig1 := extractFloat128Frac1( b );
    bSig0 := extractFloat128Frac0( b );
    bExp := extractFloat128Exp( b );
    bSign := extractFloat128Sign( b );
    zSign := aSign xor bSign;
    if ( aExp = $7FFF ) then begin
        if ( aSig0 or aSig1 )<>0 then
        begin
          result := propagateFloat128NaN( a, b );
          exit;
        end;
        if ( bExp = $7FFF ) then begin
            if ( bSig0 or bSig1 )<>0 then
              begin
                result := propagateFloat128NaN( a, b );
                exit;
              end;
            goto invalid;
        end;
        result := packFloat128( zSign, $7FFF, 0, 0 );
        exit;
    end;
    if ( bExp = $7FFF ) then begin
        if ( bSig0 or bSig1 )<>0 then
          begin
            result := propagateFloat128NaN( a, b );
            exit;
          end;
        result := packFloat128( zSign, 0, 0, 0 );
        exit;
    end;
    if ( bExp = 0 ) then begin
        if ( ( bSig0 or bSig1 ) = 0 ) then begin
            if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
 invalid:
                float_raise( float_flag_invalid );
                z.low := float128_default_nan_low;
                z.high := float128_default_nan_high;
                result := z;
                exit;
            end;
            float_raise( float_flag_divbyzero );
            result := packFloat128( zSign, $7FFF, 0, 0 );
            exit;
        end;
        normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
    end;
    if ( aExp = 0 ) then begin
        if ( ( aSig0 or aSig1 ) = 0 ) then
          begin
            result := packFloat128( zSign, 0, 0, 0 );
            exit;
          end;
        normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    end;
    zExp := aExp - bExp + $3FFD;
    shortShift128Left(
        aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
    shortShift128Left(
        bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
        shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
        inc(zExp);
    end;
    zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
    while ( sbits64(rem0) < 0 ) do begin
        dec(zSig0);
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
    end;
    zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 and $3FFF ) <= 4 ) then begin
        mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
        while ( sbits64(rem1) < 0 ) do begin
            dec(zSig1);
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
        end;
        zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
    end;
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
    result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );

end;

{*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_rem(a: float128; b: float128): float128;
var
    aSign, zSign: flag;
    aExp, bExp, expDiff: int32;
    aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
    allZero, alternateASig0, alternateASig1, sigMean1: bits64;
    sigMean0: sbits64;
    z: float128;
label
    invalid;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    bSig1 := extractFloat128Frac1( b );
    bSig0 := extractFloat128Frac0( b );
    bExp := extractFloat128Exp( b );
    if ( aExp = $7FFF ) then begin
        if (    (( aSig0 or aSig1 )<>0)
             or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
            result := propagateFloat128NaN( a, b );
            exit;
        end;
        goto invalid;
    end;
    if ( bExp = $7FFF ) then begin
        if ( bSig0 or bSig1 )<>0 then
          begin
            result := propagateFloat128NaN( a, b );
            exit;
          end;
        result := a;
        exit;
    end;
    if ( bExp = 0 ) then begin
        if ( ( bSig0 or bSig1 ) = 0 ) then begin
 invalid:
            float_raise( float_flag_invalid );
            z.low := float128_default_nan_low;
            z.high := float128_default_nan_high;
            result := z;
            exit;
        end;
        normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
    end;
    if ( aExp = 0 ) then begin
        if ( ( aSig0 or aSig1 ) = 0 ) then
          begin
            result := a;
            exit;
          end;
        normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    end;
    expDiff := aExp - bExp;
    if ( expDiff < -1 ) then
      begin
        result := a;
        exit;
      end;
    shortShift128Left(
        aSig0 or int64( $0001000000000000 ),
        aSig1,
        15 - ord( expDiff < 0 ),
        aSig0,
        aSig1
    );
    shortShift128Left(
        bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
    q := le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
    dec(expDiff,64);
    while ( 0 < expDiff ) do begin
        q := estimateDiv128To64( aSig0, aSig1, bSig0 );
        if ( 4 < q ) then
          q := q - 4
        else
          q := 0;
        mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
        shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
        shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
        sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
        dec(expDiff,61);
    end;
    if ( -64 < expDiff ) then begin
        q := estimateDiv128To64( aSig0, aSig1, bSig0 );
        if ( 4 < q ) then
          q := q - 4
        else
          q := 0;
        q := q shr (- expDiff);
        shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
        inc(expDiff,52);
        if ( expDiff < 0 ) then begin
            shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
        end
        else begin
            shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
        end;
        mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
        sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
    end
    else begin
        shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
        shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
    end;
    repeat
        alternateASig0 := aSig0;
        alternateASig1 := aSig1;
        inc(q);
        sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
    until not( 0 <= sbits64(aSig0) );
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
    if (    ( sigMean0 < 0 )
         or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
        aSig0 := alternateASig0;
        aSig1 := alternateASig1;
    end;
    zSign := ord( sbits64(aSig0) < 0 );
    if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
    result :=
        normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );

end;

{*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_sqrt(a: float128): float128;
var
    aSign: flag;
    aExp, zExp: int32;
    aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
    rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
    z: float128;
label
    invalid;
begin
    aSig1 := extractFloat128Frac1( a );
    aSig0 := extractFloat128Frac0( a );
    aExp := extractFloat128Exp( a );
    aSign := extractFloat128Sign( a );
    if ( aExp = $7FFF ) then begin
        if ( aSig0 or aSig1 )<>0 then
          begin
            result := propagateFloat128NaN( a, a );
            exit;
          end;
        if ( aSign=0 ) then
          begin
            result := a;
            exit;
          end;
        goto invalid;
    end;
    if ( aSign<>0 ) then begin
        if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
          begin
            result := a;
            exit;
          end;
 invalid:
        float_raise( float_flag_invalid );
        z.low := float128_default_nan_low;
        z.high := float128_default_nan_high;
        result := z;
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( ( aSig0 or aSig1 ) = 0 ) then
        begin
          result := packFloat128( 0, 0, 0, 0 );
          exit;
        end;
        normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
    end;
    zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
    aSig0 := aSig0 or int64( $0001000000000000 );
    zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
    zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
    doubleZSig0 := zSig0 shl 1;
    mul64To128( zSig0, zSig0, term0, term1 );
    sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
    while ( sbits64(rem0) < 0 ) do begin
        dec(zSig0);
        dec(doubleZSig0,2);
        add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
    end;
    zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 and $1FFF ) <= 5 ) then begin
        if ( zSig1 = 0 ) then zSig1 := 1;
        mul64To128( doubleZSig0, zSig1, term1, term2 );
        sub128( rem1, 0, term1, term2, rem1, rem2 );
        mul64To128( zSig1, zSig1, term2, term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
        while ( sbits64(rem1) < 0 ) do begin
            dec(zSig1);
            shortShift128Left( 0, zSig1, 1, term2, term3 );
            term3 := term3 or 1;
            term2 := term2 or doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
        end;
        zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
    end;
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
    result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );

end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_eq(a: float128; b: float128): flag;
begin
    if (    (    ( extractFloat128Exp( a ) = $7FFF )
              and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
         or (    ( extractFloat128Exp( b ) = $7FFF )
              and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
       ) then begin
        if (    (float128_is_signaling_nan( a )<>0)
             or (float128_is_signaling_nan( b )<>0) ) then begin
            float_raise( float_flag_invalid );
        end;
        result := 0;
        exit;
    end;
    result := ord(
           ( a.low = b.low )
        and (    ( a.high = b.high )
             or (    ( a.low = 0 )
                  and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
           ));

end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*}

function float128_le(a: float128; b: float128): flag;
var
    aSign, bSign: flag;
begin
    if (    (    ( extractFloat128Exp( a ) = $7FFF )
              and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
         or (    ( extractFloat128Exp( b ) = $7FFF )
              and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
       ) then begin
        float_raise( float_flag_invalid );
        result := 0;
        exit;
    end;
    aSign := extractFloat128Sign( a );
    bSign := extractFloat128Sign( b );
    if ( aSign <> bSign ) then begin
        result := ord(
               (aSign<>0)
            or (    ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
                 = 0 ));
        exit;
    end;
    if aSign<>0 then
      result := le128( b.high, b.low, a.high, a.low )
    else
      result := le128( a.high, a.low, b.high, b.low );

end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| the corresponding value `b', and 0 otherwise.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_lt(a: float128; b: float128): flag;
var
    aSign, bSign: flag;
begin
    if (    (    ( extractFloat128Exp( a ) = $7FFF )
              and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
         or (    ( extractFloat128Exp( b ) = $7FFF )
              and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
       ) then begin
        float_raise( float_flag_invalid );
        result := 0;
        exit;
    end;
    aSign := extractFloat128Sign( a );
    bSign := extractFloat128Sign( b );
    if ( aSign <> bSign ) then begin
        result := ord(
               (aSign<>0)
            and (    ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
                 <> 0 ));
        exit;
    end;
    if aSign<>0 then
      result := lt128( b.high, b.low, a.high, a.low )
    else
      result := lt128( a.high, a.low, b.high, b.low );

end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_eq_signaling(a: float128; b: float128): flag;
begin
    if (    (    ( extractFloat128Exp( a ) = $7FFF )
              and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
         or (    ( extractFloat128Exp( b ) = $7FFF )
              and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
       ) then begin
        float_raise( float_flag_invalid );
        result := 0;
        exit;
    end;
    result := ord(
           ( a.low = b.low )
        and (    ( a.high = b.high )
             or (    ( a.low = 0 )
                  and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
           ));

end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
| cause an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_le_quiet(a: float128; b: float128): flag;
var
    aSign, bSign: flag;
begin
    if (    (    ( extractFloat128Exp( a ) = $7FFF )
              and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
         or (    ( extractFloat128Exp( b ) = $7FFF )
              and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
       ) then begin
        if (    (float128_is_signaling_nan( a )<>0)
             or (float128_is_signaling_nan( b )<>0) ) then begin
            float_raise( float_flag_invalid );
        end;
        result := 0;
        exit;
    end;
    aSign := extractFloat128Sign( a );
    bSign := extractFloat128Sign( b );
    if ( aSign <> bSign ) then begin
        result := ord(
               (aSign<>0)
            or (    ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
                 = 0 ));
        exit;
    end;
    if aSign<>0 then
      result := le128( b.high, b.low, a.high, a.low )
    else
      result := le128( a.high, a.low, b.high, b.low );

end;

{*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*}

function float128_lt_quiet(a: float128; b: float128): flag;
var
    aSign, bSign: flag;
begin
    if (    (    ( extractFloat128Exp( a ) = $7FFF )
              and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
         or (    ( extractFloat128Exp( b ) = $7FFF )
              and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
       ) then begin
        if ( (float128_is_signaling_nan( a )<>0)
             or (float128_is_signaling_nan( b )<>0) ) then begin
            float_raise( float_flag_invalid );
        end;
        result := 0;
        exit;
    end;
    aSign := extractFloat128Sign( a );
    bSign := extractFloat128Sign( b );
    if ( aSign <> bSign ) then begin
        result := ord(
               (aSign<>0)
            and (    ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
                 <> 0 ));
        exit;
    end;
    if aSign<>0 then
      result:=lt128( b.high, b.low, a.high, a.low )
    else
      result:=lt128( a.high, a.low, b.high, b.low );

end;

{----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------}

function float64_to_float128( a : float64) : float128;
var
    aSign : flag;
    aExp : int16;
    aSig, zSig0, zSig1 : bits64;
begin
    aSig := extractFloat64Frac( a );
    aExp := extractFloat64Exp( a );
    aSign := extractFloat64Sign( a );
    if ( aExp = $7FF ) then begin
        if ( aSig<>0 ) then begin
          result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
          exit;
        end;
        result:=packFloat128( aSign, $7FFF, 0, 0 );
        exit;
    end;
    if ( aExp = 0 ) then begin
        if ( aSig = 0 ) then
          begin
            result:=packFloat128( aSign, 0, 0, 0 );
            exit;
          end;

        normalizeFloat64Subnormal( aSig, aExp, aSig );
        dec(aExp);
    end;
    shift128Right( aSig, 0, 4, zSig0, zSig1 );
    result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );

end;

{$endif FPC_SOFTFLOAT_FLOAT128}

{$endif not(defined(fpc_softfpu_interface))}

{$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}

end.

{$ifdef FPC}
  { restore context modified at implmentation start
    to possibly re-enable range and overflow checking explicitly}
  {$pop}
{$endif FPC}

{$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}