* bugfix for int64 to float conversion

2025-04-19 23:19:24 +02:00 · 2002-10-13 15:47:39 +00:00 · 2002-10-13 15:47:39 +00:00 · c0a2149c38
commit c0a2149c38
parent 87d0fb3cda
1 changed files with 87 additions and 91 deletions
--- a/rtl/inc/softfpu.pp
+++ b/rtl/inc/softfpu.pp
@ -6,7 +6,7 @@ to pascal was done by Carl Eric Codere in 2002 (ccodere@ieee.org).
 ===============================================================================

 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
-Arithmetic Package, Release 2a. 
+Arithmetic Package, Release 2a.

 Written by John R. Hauser.  This work was made possible in part by the
 International Computer Science Institute, located at Suite 600, 1947 Center
@ -15,7 +15,7 @@ National Science Foundation under grant MIP-9311980.  The original version
 of this code was written as part of a project to build a fixed-point vector
 processor in collaboration with the University of California at Berkeley,
 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
-is available through the Web page 
+is available through the Web page
 `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.

 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
@ -36,7 +36,7 @@ unit softfpu;
 { Overflow checking must be disabled,
  since some operations expect overflow!
 }
-{$Q-}  
+{$Q-}

 interface

@ -69,7 +69,7 @@ TYPE
  uint64 = qword;
  bits64 = qword;
  sbits64 = int64;
-  
+
 {$ifdef ENDIAN_LITTLE}
  float64 = packed record
    low: bits32;
@ -98,7 +98,7 @@ the corresponding value `b', and 0 otherwise.  The comparison is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float64_lt(a: float64;b: float64): flag; 
+Function float64_lt(a: float64;b: float64): flag;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the double-precision floating-point value `a' is less than
@ -107,7 +107,7 @@ is performed according to the IEC/IEEE Standard for Binary Floating-Point
 Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float64_le(a: float64;b: float64): flag; 
+Function float64_le(a: float64;b: float64): flag;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the double-precision floating-point value `a' is equal to
@ -115,7 +115,7 @@ the corresponding value `b', and 0 otherwise.  The comparison is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float64_eq(a: float64;b: float64): flag; 
+Function float64_eq(a: float64;b: float64): flag;
 {*
 -------------------------------------------------------------------------------
 Returns the square root of the double-precision floating-point value `a'.
@ -123,7 +123,7 @@ The operation is performed according to the IEC/IEEE Standard for Binary
 Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_sqrt( a: float64; var out: float64 ); 
+Procedure float64_sqrt( a: float64; var out: float64 );
 {*
 -------------------------------------------------------------------------------
 Returns the remainder of the double-precision floating-point value `a'
@ -131,7 +131,7 @@ with respect to the corresponding value `b'.  The operation is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_rem(a: float64; b : float64; var out: float64); 
+Procedure float64_rem(a: float64; b : float64; var out: float64);
 {*
 -------------------------------------------------------------------------------
 Returns the result of dividing the double-precision floating-point value `a'
@ -139,7 +139,7 @@ by the corresponding value `b'.  The operation is performed according to the
 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_div(a: float64; b : float64 ; var out: float64 ); 
+Procedure float64_div(a: float64; b : float64 ; var out: float64 );
 {*
 -------------------------------------------------------------------------------
 Returns the result of multiplying the double-precision floating-point values
@ -147,7 +147,7 @@ Returns the result of multiplying the double-precision floating-point values
 for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_mul( a: float64; b:float64; Var out: float64); 
+Procedure float64_mul( a: float64; b:float64; Var out: float64);
 {*
 -------------------------------------------------------------------------------
 Returns the result of subtracting the double-precision floating-point values
@ -155,7 +155,7 @@ Returns the result of subtracting the double-precision floating-point values
 for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_sub(a: float64; b : float64; var out: float64); 
+Procedure float64_sub(a: float64; b : float64; var out: float64);
 {*
 -------------------------------------------------------------------------------
 Returns the result of adding the double-precision floating-point values `a'
@ -163,7 +163,7 @@ and `b'.  The operation is performed according to the IEC/IEEE Standard for
 Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_add( a: float64; b : float64; Var out : float64); 
+Procedure float64_add( a: float64; b : float64; Var out : float64);
 {*
 -------------------------------------------------------------------------------
 Rounds the double-precision floating-point value `a' to an integer,
@ -172,7 +172,7 @@ operation is performed according to the IEC/IEEE Standard for Binary
 Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_round_to_int(a: float64; var out: float64 ); 
+Procedure float64_round_to_int(a: float64; var out: float64 );
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the double-precision floating-point value
@ -181,7 +181,7 @@ performed according to the IEC/IEEE Standard for Binary Floating-Point
 Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float64_to_float32(a: float64 ): float32; 
+Function float64_to_float32(a: float64 ): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the double-precision floating-point value
@ -193,7 +193,7 @@ the conversion overflows, the largest integer with the same sign as `a' is
 returned.
 -------------------------------------------------------------------------------
 *}
-Function float64_to_int32_round_to_zero(a: float64 ): int32; 
+Function float64_to_int32_round_to_zero(a: float64 ): int32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the double-precision floating-point value
@ -205,7 +205,7 @@ positive integer is returned.  Otherwise, if the conversion overflows, the
 largest integer with the same sign as `a' is returned.
 -------------------------------------------------------------------------------
 *}
-Function float64_to_int32(a: float64): int32; 
+Function float64_to_int32(a: float64): int32;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the single-precision floating-point value `a' is less than
@ -213,7 +213,7 @@ the corresponding value `b', and 0 otherwise.  The comparison is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_lt( a:float32 ; b : float32): flag; 
+Function float32_lt( a:float32 ; b : float32): flag;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the single-precision floating-point value `a' is less than
@ -222,7 +222,7 @@ is performed according to the IEC/IEEE Standard for Binary Floating-Point
 Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_le( a: float32; b : float32 ):flag; 
+Function float32_le( a: float32; b : float32 ):flag;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the single-precision floating-point value `a' is equal to
@ -230,7 +230,7 @@ the corresponding value `b', and 0 otherwise.  The comparison is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_eq( a:float32; b:float32): flag; 
+Function float32_eq( a:float32; b:float32): flag;
 {*
 -------------------------------------------------------------------------------
 Returns the square root of the single-precision floating-point value `a'.
@ -238,7 +238,7 @@ The operation is performed according to the IEC/IEEE Standard for Binary
 Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_sqrt(a: float32 ): float32; 
+Function float32_sqrt(a: float32 ): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the remainder of the single-precision floating-point value `a'
@ -246,7 +246,7 @@ with respect to the corresponding value `b'.  The operation is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_rem(a: float32; b: float32 ):float32; 
+Function float32_rem(a: float32; b: float32 ):float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of dividing the single-precision floating-point value `a'
@ -254,7 +254,7 @@ by the corresponding value `b'.  The operation is performed according to the
 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_div(a: float32;b: float32 ): float32; 
+Function float32_div(a: float32;b: float32 ): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of multiplying the single-precision floating-point values
@ -262,7 +262,7 @@ Returns the result of multiplying the single-precision floating-point values
 for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_mul(a: float32; b: float32 ) : float32; 
+Function float32_mul(a: float32; b: float32 ) : float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of subtracting the single-precision floating-point values
@ -270,7 +270,7 @@ Returns the result of subtracting the single-precision floating-point values
 for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_sub( a: float32 ; b:float32 ): float32; 
+Function float32_sub( a: float32 ; b:float32 ): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of adding the single-precision floating-point values `a'
@ -278,7 +278,7 @@ and `b'.  The operation is performed according to the IEC/IEEE Standard for
 Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_add( a: float32; b:float32 ): float32; 
+Function float32_add( a: float32; b:float32 ): float32;
 {*
 -------------------------------------------------------------------------------
 Rounds the single-precision floating-point value `a' to an integer,
@ -287,7 +287,7 @@ operation is performed according to the IEC/IEEE Standard for Binary
 Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_round_to_int( a: float32): float32; 
+Function float32_round_to_int( a: float32): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the single-precision floating-point value
@ -296,7 +296,7 @@ performed according to the IEC/IEEE Standard for Binary Floating-Point
 Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float32_to_float64( a : float32; var out: Float64); 
+Procedure float32_to_float64( a : float32; var out: Float64);
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the single-precision floating-point value
@ -308,7 +308,7 @@ the conversion overflows, the largest integer with the same sign as `a' is
 returned.
 -------------------------------------------------------------------------------
 *}
-Function float32_to_int32_round_to_zero( a: Float32 ): int32; 
+Function float32_to_int32_round_to_zero( a: Float32 ): int32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the single-precision floating-point value
@ -320,7 +320,7 @@ positive integer is returned.  Otherwise, if the conversion overflows, the
 largest integer with the same sign as `a' is returned.
 -------------------------------------------------------------------------------
 *}
-Function float32_to_int32( a : float32) : int32; 
+Function float32_to_int32( a : float32) : int32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the 32-bit two's complement integer `a' to
@ -328,7 +328,7 @@ the double-precision floating-point format.  The conversion is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure int32_to_float64( a: int32; var c: float64 ); 
+Procedure int32_to_float64( a: int32; var c: float64 );
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the 32-bit two's complement integer `a' to
@ -336,7 +336,7 @@ the single-precision floating-point format.  The conversion is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function int32_to_float32( a: int32): float32; 
+Function int32_to_float32( a: int32): float32;

 {*----------------------------------------------------------------------------
 | Returns the result of converting the 64-bit two's complement integer `a'
@ -367,28 +367,28 @@ Software IEC/IEEE floating-point rounding mode.
 -------------------------------------------------------------------------------
 *}
 {
-Round to nearest. 
-This is the default mode. It should be used unless there is a specific 
-need for one of the others. In this mode results are rounded to the 
-nearest representable value. If the result is midway between two 
-representable values, the even representable is chosen. Even here 
-means the lowest-order bit is zero. This rounding mode prevents 
-statistical bias and guarantees numeric stability: round-off errors 
-in a lengthy calculation will remain smaller than half of FLT_EPSILON. 
+Round to nearest.
+This is the default mode. It should be used unless there is a specific
+need for one of the others. In this mode results are rounded to the
+nearest representable value. If the result is midway between two
+representable values, the even representable is chosen. Even here
+means the lowest-order bit is zero. This rounding mode prevents
+statistical bias and guarantees numeric stability: round-off errors
+in a lengthy calculation will remain smaller than half of FLT_EPSILON.

-Round toward plus Infinity. 
-All results are rounded to the smallest representable value which is 
-greater than the result. 
+Round toward plus Infinity.
+All results are rounded to the smallest representable value which is
+greater than or equal to the exact result.

-Round toward minus Infinity. 
-All results are rounded to the largest representable value which is 
-less than the result. 
+Round toward minus Infinity.
+All results are rounded to the largest representable value which is
+less than or equal to the exact result.

-Round toward zero. 
-All results are rounded to the largest representable value whose 
-magnitude is less than that of the result. In other words, if the 
-result is negative it is rounded up; if it is positive, it is 
-rounded down. 
+Round toward zero.
+All results are rounded to the largest representable value whose
+magnitude is not greater than that of the exact result. In other words,
+if the result is negative it is rounded up; if it is positive, it is
+rounded down.
 }
    float_round_nearest_even = 0;
    float_round_down         = 1;
@ -443,7 +443,7 @@ Begin
  float_exception_flags := float_exception_flags or i;
  if (float_exception_flags and float_flag_invalid) <> 0 then
     RunError(207)
-  else  
+  else
  if (float_exception_flags and float_flag_divbyzero) <> 0 then
     RunError(200)
  else
@ -479,7 +479,7 @@ var
 Begin
    if ( count = 0 ) then
        z := a
-   else 
+   else
    if ( count < 32 ) then
    Begin
        z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
@ -557,7 +557,7 @@ Begin
        z1 := a1;
        z0 := a0;
    End
-   else 
+   else
    if ( count < 32 ) then
    Begin
        z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
@ -569,7 +569,7 @@ Begin
        Begin
            z1 := a0 OR bits32( a1 <> 0 );
        End
-       else 
+       else
        if ( count < 64 ) Then
        Begin
            z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
@ -1081,7 +1081,7 @@ End;
 function countLeadingZeros64( a : bits64): int8;
 var
 shiftcount : int8;
-Begin 
+Begin
    shiftCount := 0;
    if ( a <  (bits64(1)  shl 32 )) then
        shiftCount := shiftcount + 32
@ -1441,7 +1441,7 @@ End;
  sign : flag;
  high, low : bits32;
 end;
- 
+
 (*----------------------------------------------------------------------------
 | The pattern for a default generated single-precision NaN.
 *----------------------------------------------------------------------------*)
@ -1464,7 +1464,7 @@ function float32_is_signaling_nan(a: float32):flag;
 begin
   float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and ( (a and $003FFFFF)<>0 );
 end;
- 
+
 (*----------------------------------------------------------------------------
 | Returns the result of converting the single-precision floating-point NaN
 | `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
@ -1490,7 +1490,7 @@ function CommonNanToFloat32(a : CommonNaNT): float32;
 begin
    CommonNanToFloat32:= ( ( (bits32) a.sign ) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
 end;
- 
+
 (*----------------------------------------------------------------------------
 | Takes two single-precision floating-point values `a' and `b', one of which
 | is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
@ -1607,7 +1607,7 @@ var
        c := a;
 end;

-{$ENDIF} 
+{$ENDIF}

 (****************************************************************************)
 (*                        END ENDIAN SPECIFIC CODE                          *)
@ -4573,14 +4573,14 @@ Begin
      begin
        int64_to_float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
      end
-    else 
+    else
       begin
        shiftCount := shiftCount + 7;
        if ( shiftCount < 0 ) then
          begin
            intval.low := int64rec(AbsA).low;
            intval.high := int64rec(AbsA).high;
-            shift64RightJamming( intval.low, intval.high, - shiftCount, 
+            shift64RightJamming( intval.low, intval.high, - shiftCount,
               intval.low, intval.high);
            int64rec(absA).low := intval.low;
            int64rec(absA).high := intval.high;
@ -4597,51 +4597,47 @@ End;
 | to the double-precision floating-point format.  The conversion is performed
 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*}
-
 function int64_to_float64( a: int64 ): float64;
 var
  zSign : flag;
  float_result : float64;
  intval : int64rec;
+ AbsA : bits64;          { magnitude of a as an unsigned 64-bit value }
+ shiftcount : int8;      { left shift that moves the leading 1 of AbsA to bit 52 }
+ zSig0, zSig1 : bits32;  { high / low 32-bit words of the shifted magnitude }
 Begin
    if ( a = 0 ) then
-      begin
-        int64_to_float64.low := 0;
-        int64_to_float64.high := 0;
-        exit;
+      Begin
+       packFloat64( 0, 0, 0, 0, float_result );
+       int64_to_float64 := float_result; { exit alone would leave the function result unassigned }
+       exit;
       end;
-    if ( a =  sbits64 ( 1 shl 64 ) ) then
-      begin
-        packFloat64(1, $43E, 0, 0, float_result);
-        int64_to_float64 := float_result;
-        exit;
-      end;  
-    if a < 0 then
-      zSign := flag(TRUE)
+    zSign := flag( a < 0 );
+    if ZSign<>0 then
+      AbsA := -a
     else
-      zSign := flag(FALSE);
-    if zSign<>0 then 
-      a := -a;
-    if zSign <> 0 then
-     begin
-       a:=-a;
-       intval.low := int64rec(a).low;
-       intval.high := int64rec(a).high;
-       normalizeRoundAndPackFloat64( zSign, $43C, intval.low, intval.high , float_result )
-     end
+      AbsA := a;
+    shiftCount := countLeadingZeros64( absA ) - 11;
+    if ( 0 <= shiftCount ) then
+      Begin
+        absA := absA shl shiftcount;
+        zSig0:=int64rec(absA).high;
+        zSig1:=int64rec(absA).low;
+      End
     else
-     begin 
-       intval.low := int64rec(a).low;
-       intval.high := int64rec(a).high;
-       normalizeRoundAndPackFloat64( zSign, $43C, intval.low, intval.high , float_result );
-     end;
+      { pass high/low words, not the 64-bit value; NOTE(review): shifted-out bits look truncated, not rounded - confirm }
+      shift64Right( int64rec(absA).high, int64rec(absA).low, - shiftCount, zSig0, zSig1 );
+    packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
     int64_to_float64:= float_result;
 End;

 end.
 {
   $Log$
-   Revision 1.3  2002-10-12 20:24:22  carl
+   Revision 1.4  2002-10-13 15:47:39  carl
+      * bugfix for int64 to float conversion
+
+   Revision 1.3  2002/10/12 20:24:22  carl
     + int64_to_float conversion routines

   Revision 1.2  2002/10/08 20:07:08  carl