Quellcode durchsuchen

* bugfix for int64 to float conversion

carl vor 23 Jahren
Ursprung
Commit
c0a2149c38
1 geänderte Dateien mit 90 neuen und 94 gelöschten Zeilen
  1. 90 94
      rtl/inc/softfpu.pp

+ 90 - 94
rtl/inc/softfpu.pp

@@ -6,7 +6,7 @@ to pascal was done by Carl Eric Codere in 2002 ([email protected]).
 ===============================================================================
 
 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
-Arithmetic Package, Release 2a. 
+Arithmetic Package, Release 2a.
 
 Written by John R. Hauser.  This work was made possible in part by the
 International Computer Science Institute, located at Suite 600, 1947 Center
@@ -15,7 +15,7 @@ National Science Foundation under grant MIP-9311980.  The original version
 of this code was written as part of a project to build a fixed-point vector
 processor in collaboration with the University of California at Berkeley,
 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
-is available through the Web page 
+is available through the Web page
 `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
 
 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
@@ -36,7 +36,7 @@ unit softfpu;
 { Overflow checking must be disabled,
   since some operations expect overflow!
 }
-{$Q-}  
+{$Q-}
 
 interface
 
@@ -69,7 +69,7 @@ TYPE
   uint64 = qword;
   bits64 = qword;
   sbits64 = int64;
-  
+
 {$ifdef ENDIAN_LITTLE}
   float64 = packed record
     low: bits32;
@@ -98,7 +98,7 @@ the corresponding value `b', and 0 otherwise.  The comparison is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float64_lt(a: float64;b: float64): flag; 
+Function float64_lt(a: float64;b: float64): flag;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the double-precision floating-point value `a' is less than
@@ -107,7 +107,7 @@ is performed according to the IEC/IEEE Standard for Binary Floating-Point
 Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float64_le(a: float64;b: float64): flag; 
+Function float64_le(a: float64;b: float64): flag;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the double-precision floating-point value `a' is equal to
@@ -115,7 +115,7 @@ the corresponding value `b', and 0 otherwise.  The comparison is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float64_eq(a: float64;b: float64): flag; 
+Function float64_eq(a: float64;b: float64): flag;
 {*
 -------------------------------------------------------------------------------
 Returns the square root of the double-precision floating-point value `a'.
@@ -123,7 +123,7 @@ The operation is performed according to the IEC/IEEE Standard for Binary
 Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_sqrt( a: float64; var out: float64 ); 
+Procedure float64_sqrt( a: float64; var out: float64 );
 {*
 -------------------------------------------------------------------------------
 Returns the remainder of the double-precision floating-point value `a'
@@ -131,7 +131,7 @@ with respect to the corresponding value `b'.  The operation is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_rem(a: float64; b : float64; var out: float64); 
+Procedure float64_rem(a: float64; b : float64; var out: float64);
 {*
 -------------------------------------------------------------------------------
 Returns the result of dividing the double-precision floating-point value `a'
@@ -139,7 +139,7 @@ by the corresponding value `b'.  The operation is performed according to the
 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_div(a: float64; b : float64 ; var out: float64 ); 
+Procedure float64_div(a: float64; b : float64 ; var out: float64 );
 {*
 -------------------------------------------------------------------------------
 Returns the result of multiplying the double-precision floating-point values
@@ -147,7 +147,7 @@ Returns the result of multiplying the double-precision floating-point values
 for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_mul( a: float64; b:float64; Var out: float64); 
+Procedure float64_mul( a: float64; b:float64; Var out: float64);
 {*
 -------------------------------------------------------------------------------
 Returns the result of subtracting the double-precision floating-point values
@@ -155,7 +155,7 @@ Returns the result of subtracting the double-precision floating-point values
 for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_sub(a: float64; b : float64; var out: float64); 
+Procedure float64_sub(a: float64; b : float64; var out: float64);
 {*
 -------------------------------------------------------------------------------
 Returns the result of adding the double-precision floating-point values `a'
@@ -163,7 +163,7 @@ and `b'.  The operation is performed according to the IEC/IEEE Standard for
 Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_add( a: float64; b : float64; Var out : float64); 
+Procedure float64_add( a: float64; b : float64; Var out : float64);
 {*
 -------------------------------------------------------------------------------
 Rounds the double-precision floating-point value `a' to an integer,
@@ -172,7 +172,7 @@ operation is performed according to the IEC/IEEE Standard for Binary
 Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float64_round_to_int(a: float64; var out: float64 ); 
+Procedure float64_round_to_int(a: float64; var out: float64 );
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the double-precision floating-point value
@@ -181,7 +181,7 @@ performed according to the IEC/IEEE Standard for Binary Floating-Point
 Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float64_to_float32(a: float64 ): float32; 
+Function float64_to_float32(a: float64 ): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the double-precision floating-point value
@@ -193,7 +193,7 @@ the conversion overflows, the largest integer with the same sign as `a' is
 returned.
 -------------------------------------------------------------------------------
 *}
-Function float64_to_int32_round_to_zero(a: float64 ): int32; 
+Function float64_to_int32_round_to_zero(a: float64 ): int32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the double-precision floating-point value
@@ -205,7 +205,7 @@ positive integer is returned.  Otherwise, if the conversion overflows, the
 largest integer with the same sign as `a' is returned.
 -------------------------------------------------------------------------------
 *}
-Function float64_to_int32(a: float64): int32; 
+Function float64_to_int32(a: float64): int32;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the single-precision floating-point value `a' is less than
@@ -213,7 +213,7 @@ the corresponding value `b', and 0 otherwise.  The comparison is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_lt( a:float32 ; b : float32): flag; 
+Function float32_lt( a:float32 ; b : float32): flag;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the single-precision floating-point value `a' is less than
@@ -222,7 +222,7 @@ is performed according to the IEC/IEEE Standard for Binary Floating-Point
 Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_le( a: float32; b : float32 ):flag; 
+Function float32_le( a: float32; b : float32 ):flag;
 {*
 -------------------------------------------------------------------------------
 Returns 1 if the single-precision floating-point value `a' is equal to
@@ -230,7 +230,7 @@ the corresponding value `b', and 0 otherwise.  The comparison is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_eq( a:float32; b:float32): flag; 
+Function float32_eq( a:float32; b:float32): flag;
 {*
 -------------------------------------------------------------------------------
 Returns the square root of the single-precision floating-point value `a'.
@@ -238,7 +238,7 @@ The operation is performed according to the IEC/IEEE Standard for Binary
 Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_sqrt(a: float32 ): float32; 
+Function float32_sqrt(a: float32 ): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the remainder of the single-precision floating-point value `a'
@@ -246,7 +246,7 @@ with respect to the corresponding value `b'.  The operation is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_rem(a: float32; b: float32 ):float32; 
+Function float32_rem(a: float32; b: float32 ):float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of dividing the single-precision floating-point value `a'
@@ -254,7 +254,7 @@ by the corresponding value `b'.  The operation is performed according to the
 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_div(a: float32;b: float32 ): float32; 
+Function float32_div(a: float32;b: float32 ): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of multiplying the single-precision floating-point values
@@ -262,7 +262,7 @@ Returns the result of multiplying the single-precision floating-point values
 for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_mul(a: float32; b: float32 ) : float32; 
+Function float32_mul(a: float32; b: float32 ) : float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of subtracting the single-precision floating-point values
@@ -270,7 +270,7 @@ Returns the result of subtracting the single-precision floating-point values
 for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_sub( a: float32 ; b:float32 ): float32; 
+Function float32_sub( a: float32 ; b:float32 ): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of adding the single-precision floating-point values `a'
@@ -278,7 +278,7 @@ and `b'.  The operation is performed according to the IEC/IEEE Standard for
 Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_add( a: float32; b:float32 ): float32; 
+Function float32_add( a: float32; b:float32 ): float32;
 {*
 -------------------------------------------------------------------------------
 Rounds the single-precision floating-point value `a' to an integer,
@@ -287,7 +287,7 @@ operation is performed according to the IEC/IEEE Standard for Binary
 Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function float32_round_to_int( a: float32): float32; 
+Function float32_round_to_int( a: float32): float32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the single-precision floating-point value
@@ -296,7 +296,7 @@ performed according to the IEC/IEEE Standard for Binary Floating-Point
 Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure float32_to_float64( a : float32; var out: Float64); 
+Procedure float32_to_float64( a : float32; var out: Float64);
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the single-precision floating-point value
@@ -308,7 +308,7 @@ the conversion overflows, the largest integer with the same sign as `a' is
 returned.
 -------------------------------------------------------------------------------
 *}
-Function float32_to_int32_round_to_zero( a: Float32 ): int32; 
+Function float32_to_int32_round_to_zero( a: Float32 ): int32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the single-precision floating-point value
@@ -320,7 +320,7 @@ positive integer is returned.  Otherwise, if the conversion overflows, the
 largest integer with the same sign as `a' is returned.
 -------------------------------------------------------------------------------
 *}
-Function float32_to_int32( a : float32) : int32; 
+Function float32_to_int32( a : float32) : int32;
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the 32-bit two's complement integer `a' to
@@ -328,7 +328,7 @@ the double-precision floating-point format.  The conversion is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Procedure int32_to_float64( a: int32; var c: float64 ); 
+Procedure int32_to_float64( a: int32; var c: float64 );
 {*
 -------------------------------------------------------------------------------
 Returns the result of converting the 32-bit two's complement integer `a' to
@@ -336,7 +336,7 @@ the single-precision floating-point format.  The conversion is performed
 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 -------------------------------------------------------------------------------
 *}
-Function int32_to_float32( a: int32): float32; 
+Function int32_to_float32( a: int32): float32;
 
 {*----------------------------------------------------------------------------
 | Returns the result of converting the 64-bit two's complement integer `a'
@@ -367,28 +367,28 @@ Software IEC/IEEE floating-point rounding mode.
 -------------------------------------------------------------------------------
 *}
 {
-Round to nearest. 
-This is the default mode. It should be used unless there is a specific 
-need for one of the others. In this mode results are rounded to the 
-nearest representable value. If the result is midway between two 
-representable values, the even representable is chosen. Even here 
-means the lowest-order bit is zero. This rounding mode prevents 
-statistical bias and guarantees numeric stability: round-off errors 
-in a lengthy calculation will remain smaller than half of FLT_EPSILON. 
-
-Round toward plus Infinity. 
-All results are rounded to the smallest representable value which is 
-greater than the result. 
-
-Round toward minus Infinity. 
-All results are rounded to the largest representable value which is 
-less than the result. 
-
-Round toward zero. 
-All results are rounded to the largest representable value whose 
-magnitude is less than that of the result. In other words, if the 
-result is negative it is rounded up; if it is positive, it is 
-rounded down. 
+Round to nearest.
+This is the default mode. It should be used unless there is a specific
+need for one of the others. In this mode results are rounded to the
+nearest representable value. If the result is midway between two
+representable values, the even representable is chosen. Even here
+means the lowest-order bit is zero. This rounding mode prevents
+statistical bias and guarantees numeric stability: round-off errors
+in a lengthy calculation will remain smaller than half of FLT_EPSILON.
+
+Round toward plus Infinity.
+All results are rounded to the smallest representable value which is
+greater than or equal to the result.
+
+Round toward minus Infinity.
+All results are rounded to the largest representable value which is
+less than or equal to the result.
+
+Round toward zero.
+All results are rounded to the largest representable value whose
+magnitude is less than or equal to that of the result. In other words,
+if the result is negative it is rounded up; if it is positive, it is
+rounded down.
 }
     float_round_nearest_even = 0;
     float_round_down         = 1;
@@ -443,7 +443,7 @@ Begin
   float_exception_flags := float_exception_flags or i;
   if (float_exception_flags and float_flag_invalid) <> 0 then
      RunError(207)
-  else  
+  else
   if (float_exception_flags and float_flag_divbyzero) <> 0 then
      RunError(200)
   else
@@ -479,7 +479,7 @@ var
 Begin
     if ( count = 0 ) then
         z := a
-   else 
+   else
     if ( count < 32 ) then
     Begin
         z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
@@ -557,7 +557,7 @@ Begin
         z1 := a1;
         z0 := a0;
     End
-   else 
+   else
     if ( count < 32 ) then
     Begin
         z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
@@ -569,7 +569,7 @@ Begin
         Begin
             z1 := a0 OR bits32( a1 <> 0 );
         End
-       else 
+       else
         if ( count < 64 ) Then
         Begin
             z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
@@ -1081,7 +1081,7 @@ End;
 function countLeadingZeros64( a : bits64): int8;
 var
  shiftcount : int8;
-Begin 
+Begin
     shiftCount := 0;
     if ( a <  (bits64(1)  shl 32 )) then
         shiftCount := shiftcount + 32
@@ -1441,7 +1441,7 @@ End;
   sign : flag;
   high, low : bits32;
  end;
- 
+
 (*----------------------------------------------------------------------------
 | The pattern for a default generated single-precision NaN.
 *----------------------------------------------------------------------------*)
@@ -1464,7 +1464,7 @@ function float32_is_signaling_nan(a: float32):flag;
  begin
    float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and ( (a and $003FFFFF)<>0 );
  end;
- 
+
 (*----------------------------------------------------------------------------
 | Returns the result of converting the single-precision floating-point NaN
 | `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
@@ -1490,7 +1490,7 @@ function CommonNanToFloat32(a : CommonNaNT): float32;
  begin
     CommonNanToFloat32:= ( ( (bits32) a.sign ) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  end;
- 
+
 (*----------------------------------------------------------------------------
 | Takes two single-precision floating-point values `a' and `b', one of which
 | is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
@@ -1607,7 +1607,7 @@ var
         c := a;
  end;
 
-{$ENDIF} 
+{$ENDIF}
 
 (****************************************************************************)
 (*                        END ENDIAN SPECIFIC CODE                          *)
@@ -4573,14 +4573,14 @@ Begin
       begin
         int64_to_float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
       end
-    else 
+    else
        begin
         shiftCount := shiftCount + 7;
         if ( shiftCount < 0 ) then
           begin
             intval.low := int64rec(AbsA).low;
             intval.high := int64rec(AbsA).high;
-            shift64RightJamming( intval.low, intval.high, - shiftCount, 
+            shift64RightJamming( intval.low, intval.high, - shiftCount,
                intval.low, intval.high);
             int64rec(absA).low := intval.low;
             int64rec(absA).high := intval.high;
@@ -4597,51 +4597,47 @@ End;
 | to the double-precision floating-point format.  The conversion is performed
 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*}
-
 function int64_to_float64( a: int64 ): float64;
 var
  zSign : flag;
  float_result : float64;
  intval : int64rec;
+ AbsA : bits64;
+ shiftcount : int8;
+ zSig0, zSig1 : bits32;
 Begin
     if ( a = 0 ) then
-      begin
-        int64_to_float64.low := 0;
-        int64_to_float64.high := 0;
-        exit;
+      Begin
+       packFloat64( 0, 0, 0, 0, float_result );
+       exit;
       end;
-    if ( a =  sbits64 ( 1 shl 64 ) ) then
-      begin
-        packFloat64(1, $43E, 0, 0, float_result);
-        int64_to_float64 := float_result;
-        exit;
-      end;  
-    if a < 0 then
-      zSign := flag(TRUE)
+    zSign := flag( a < 0 );
+    if ZSign<>0 then
+      AbsA := -a
     else
-      zSign := flag(FALSE);
-    if zSign<>0 then 
-      a := -a;
-    if zSign <> 0 then
-     begin
-       a:=-a;
-       intval.low := int64rec(a).low;
-       intval.high := int64rec(a).high;
-       normalizeRoundAndPackFloat64( zSign, $43C, intval.low, intval.high , float_result )
-     end
+      AbsA := a;
+    shiftCount := countLeadingZeros64( absA ) - 11;
+    if ( 0 <= shiftCount ) then
+      Begin
+        absA := absA shl shiftcount;
+        zSig0:=int64rec(absA).high;
+        zSig1:=int64rec(absA).low;
+      End
     else
-     begin 
-       intval.low := int64rec(a).low;
-       intval.high := int64rec(a).high;
-       normalizeRoundAndPackFloat64( zSign, $43C, intval.low, intval.high , float_result );
-     end;
+      Begin
+        shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
+      End;
+    packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
     int64_to_float64:= float_result;
 End;
 
 end.
 {
    $Log$
-   Revision 1.3  2002-10-12 20:24:22  carl
+   Revision 1.4  2002-10-13 15:47:39  carl
+      * bugfix for int64 to float conversion
+
+   Revision 1.3  2002/10/12 20:24:22  carl
      + int64_to_float conversion routines
 
    Revision 1.2  2002/10/08 20:07:08  carl