=============================================================================*/
+/* softfloat (and in particular the code in softfloat-specialize.h) is
+ * target-dependent and needs the TARGET_* macros.
+ */
+#include "config.h"
+
#include "softfloat.h"
/*----------------------------------------------------------------------------
STATUS(float_exception_flags) = val;
}
-#ifdef FLOATX80
void set_floatx80_rounding_precision(int val STATUS_PARAM)
{
STATUS(floatx80_rounding_precision) = val;
}
-#endif
/*----------------------------------------------------------------------------
| Returns the fraction bits of the half-precision floating-point value `a'.
| Returns the exponent bits of the half-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
-INLINE int16 extractFloat16Exp(float16 a)
+INLINE int_fast16_t extractFloat16Exp(float16 a)
{
return (float16_val(a) >> 10) & 0x1f;
}
int8 roundingMode;
flag roundNearestEven;
int8 roundIncrement, roundBits;
- int32 z;
+ int32_t z;
roundingMode = STATUS(float_rounding_mode);
roundNearestEven = ( roundingMode == float_round_nearest_even );
{
int8 roundingMode;
flag roundNearestEven, increment;
- int64 z;
+ int64_t z;
roundingMode = STATUS(float_rounding_mode);
roundNearestEven = ( roundingMode == float_round_nearest_even );
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
-INLINE int16 extractFloat32Exp( float32 a )
+INLINE int_fast16_t extractFloat32Exp(float32 a)
{
return ( float32_val(a)>>23 ) & 0xFF;
*----------------------------------------------------------------------------*/
static void
- normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
+ normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
{
int8 shiftCount;
| significand.
*----------------------------------------------------------------------------*/
-INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
+INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
{
return make_float32(
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
+static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
{
int8 roundingMode;
flag roundNearestEven;
return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
}
if ( zExp < 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat32(zSign, 0, 0);
+ }
isTiny =
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
|| ( zExp < -1 )
*----------------------------------------------------------------------------*/
static float32
- normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
+ normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
{
int8 shiftCount;
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
-INLINE int16 extractFloat64Exp( float64 a )
+INLINE int_fast16_t extractFloat64Exp(float64 a)
{
return ( float64_val(a)>>52 ) & 0x7FF;
*----------------------------------------------------------------------------*/
static void
- normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
+ normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
{
int8 shiftCount;
| significand.
*----------------------------------------------------------------------------*/
-INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
+INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
{
return make_float64(
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
+static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
{
int8 roundingMode;
flag roundNearestEven;
- int16 roundIncrement, roundBits;
+ int_fast16_t roundIncrement, roundBits;
flag isTiny;
roundingMode = STATUS(float_rounding_mode);
return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
}
if ( zExp < 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat64(zSign, 0, 0);
+ }
isTiny =
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
|| ( zExp < -1 )
*----------------------------------------------------------------------------*/
static float64
- normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
+ normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
{
int8 shiftCount;
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the fraction bits of the extended double-precision floating-point
| value `a'.
goto overflow;
}
if ( zExp <= 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloatx80(zSign, 0, 0);
+ }
isTiny =
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
|| ( zExp < 0 )
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( zExp < 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat128(zSign, 0, 0, 0);
+ }
isTiny =
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
|| ( zExp < -1 )
}
-#endif
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the single-precision floating-point format. The conversion is performed
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the extended double-precision floating-point format. The conversion
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a' to
| the quadruple-precision floating-point format. The conversion is performed
}
-#endif
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the single-precision floating-point format. The conversion is performed
if ( a == 0 ) return float32_zero;
shiftCount = countLeadingZeros64( a ) - 40;
if ( 0 <= shiftCount ) {
- return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
+ return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
}
else {
shiftCount += 7;
else {
a <<= shiftCount;
}
- return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
+ return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
}
}
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the extended double-precision floating-point format. The conversion
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a' to
| the quadruple-precision floating-point format. The conversion is performed
}
-#endif
-
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the 32-bit two's complement integer format. The conversion is
int32 float32_to_int32( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
uint64_t aSig64;
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
- int32 z;
+ int32_t z;
a = float32_squash_input_denormal(a STATUS_VAR);
aSig = extractFloat32Frac( a );
| returned.
*----------------------------------------------------------------------------*/
-int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
+int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
int32 z;
int64 float32_to_int64( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
uint64_t aSig64, aSigExtra;
a = float32_squash_input_denormal(a STATUS_VAR);
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
uint64_t aSig64;
int64 z;
float64 float32_to_float64( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
a = float32_squash_input_denormal(a STATUS_VAR);
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
a = float32_squash_input_denormal(a STATUS_VAR);
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the double-precision floating-point format. The conversion is
float128 float32_to_float128( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
a = float32_squash_input_denormal(a STATUS_VAR);
}
-#endif
-
/*----------------------------------------------------------------------------
| Rounds the single-precision floating-point value `a' to an integer, and
| returns the result as a single-precision floating-point value. The
float32 float32_round_to_int( float32 a STATUS_PARAM)
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t lastBitMask, roundBitsMask;
int8 roundingMode;
uint32_t z;
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
{
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint32_t aSig, bSig, zSig;
- int16 expDiff;
+ int_fast16_t expDiff;
aSig = extractFloat32Frac( a );
aExp = extractFloat32Exp( a );
return a;
}
if ( aExp == 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ if (aSig | bSig) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ }
+ return packFloat32(zSign, 0, 0);
+ }
return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
}
zSig = 0x40000000 + aSig + bSig;
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
{
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint32_t aSig, bSig, zSig;
- int16 expDiff;
+ int_fast16_t expDiff;
aSig = extractFloat32Frac( a );
aExp = extractFloat32Exp( a );
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
{
flag aSign, bSign, zSign;
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint32_t aSig, bSig;
uint64_t zSig64;
uint32_t zSig;
float32 float32_div( float32 a, float32 b STATUS_PARAM )
{
flag aSign, bSign, zSign;
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint32_t aSig, bSig, zSig;
a = float32_squash_input_denormal(a STATUS_VAR);
b = float32_squash_input_denormal(b STATUS_VAR);
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
{
flag aSign, zSign;
- int16 aExp, bExp, expDiff;
+ int_fast16_t aExp, bExp, expDiff;
uint32_t aSig, bSig;
uint32_t q;
uint64_t aSig64, bSig64, q64;
}
+/*----------------------------------------------------------------------------
+| Returns the result of multiplying the single-precision floating-point values
+| `a' and `b' then adding 'c', with no intermediate rounding step after the
+| multiplication. The operation is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic 754-2008.
+| The flags argument allows the caller to select negation of the
+| addend, the intermediate product, or the final result. (The difference
+| between this and having the caller do a separate negation is that negating
+| externally will flip the sign bit on NaNs.)
+*----------------------------------------------------------------------------*/
+
+float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
+{
+ flag aSign, bSign, cSign, zSign;
+ int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
+ uint32_t aSig, bSig, cSig;
+ flag pInf, pZero, pSign;
+ uint64_t pSig64, cSig64, zSig64;
+ uint32_t pSig;
+ int shiftcount;
+ flag signflip, infzero;
+
+ a = float32_squash_input_denormal(a STATUS_VAR);
+ b = float32_squash_input_denormal(b STATUS_VAR);
+ c = float32_squash_input_denormal(c STATUS_VAR);
+ aSig = extractFloat32Frac(a);
+ aExp = extractFloat32Exp(a);
+ aSign = extractFloat32Sign(a);
+ bSig = extractFloat32Frac(b);
+ bExp = extractFloat32Exp(b);
+ bSign = extractFloat32Sign(b);
+ cSig = extractFloat32Frac(c);
+ cExp = extractFloat32Exp(c);
+ cSign = extractFloat32Sign(c);
+
+ infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
+ (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
+
+ /* It is implementation-defined whether the cases of (0,inf,qnan)
+ * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
+ * they return if they do), so we have to hand this information
+ * off to the target-specific pick-a-NaN routine.
+ */
+ if (((aExp == 0xff) && aSig) ||
+ ((bExp == 0xff) && bSig) ||
+ ((cExp == 0xff) && cSig)) {
+ return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
+ }
+
+ if (infzero) {
+ float_raise(float_flag_invalid STATUS_VAR);
+ return float32_default_nan;
+ }
+
+ if (flags & float_muladd_negate_c) {
+ cSign ^= 1;
+ }
+
+ signflip = (flags & float_muladd_negate_result) ? 1 : 0;
+
+ /* Work out the sign and type of the product */
+ pSign = aSign ^ bSign;
+ if (flags & float_muladd_negate_product) {
+ pSign ^= 1;
+ }
+ pInf = (aExp == 0xff) || (bExp == 0xff);
+ pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
+
+ if (cExp == 0xff) {
+ if (pInf && (pSign ^ cSign)) {
+ /* addition of opposite-signed infinities => InvalidOperation */
+ float_raise(float_flag_invalid STATUS_VAR);
+ return float32_default_nan;
+ }
+ /* Otherwise generate an infinity of the same sign */
+ return packFloat32(cSign ^ signflip, 0xff, 0);
+ }
+
+ if (pInf) {
+ return packFloat32(pSign ^ signflip, 0xff, 0);
+ }
+
+ if (pZero) {
+ if (cExp == 0) {
+ if (cSig == 0) {
+ /* Adding two exact zeroes */
+ if (pSign == cSign) {
+ zSign = pSign;
+ } else if (STATUS(float_rounding_mode) == float_round_down) {
+ zSign = 1;
+ } else {
+ zSign = 0;
+ }
+ return packFloat32(zSign ^ signflip, 0, 0);
+ }
+ /* Exact zero plus a denorm */
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat32(cSign ^ signflip, 0, 0);
+ }
+ }
+ /* Zero plus something non-zero : just return the something */
+ return make_float32(float32_val(c) ^ (signflip << 31));
+ }
+
+ if (aExp == 0) {
+ normalizeFloat32Subnormal(aSig, &aExp, &aSig);
+ }
+ if (bExp == 0) {
+ normalizeFloat32Subnormal(bSig, &bExp, &bSig);
+ }
+
+ /* Calculate the actual result a * b + c */
+
+ /* Multiply first; this is easy. */
+ /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
+ * because we want the true exponent, not the "one-less-than"
+ * flavour that roundAndPackFloat32() takes.
+ */
+ pExp = aExp + bExp - 0x7e;
+ aSig = (aSig | 0x00800000) << 7;
+ bSig = (bSig | 0x00800000) << 8;
+ pSig64 = (uint64_t)aSig * bSig;
+ if ((int64_t)(pSig64 << 1) >= 0) {
+ pSig64 <<= 1;
+ pExp--;
+ }
+
+ zSign = pSign ^ signflip;
+
+ /* Now pSig64 is the significand of the multiply, with the explicit bit in
+ * position 62.
+ */
+ if (cExp == 0) {
+ if (!cSig) {
+ /* Throw out the special case of c being an exact zero now */
+ shift64RightJamming(pSig64, 32, &pSig64);
+ pSig = pSig64;
+ return roundAndPackFloat32(zSign, pExp - 1,
+ pSig STATUS_VAR);
+ }
+ normalizeFloat32Subnormal(cSig, &cExp, &cSig);
+ }
+
+ cSig64 = (uint64_t)cSig << (62 - 23);
+ cSig64 |= LIT64(0x4000000000000000);
+ expDiff = pExp - cExp;
+
+ if (pSign == cSign) {
+ /* Addition */
+ if (expDiff > 0) {
+ /* scale c to match p */
+ shift64RightJamming(cSig64, expDiff, &cSig64);
+ zExp = pExp;
+ } else if (expDiff < 0) {
+ /* scale p to match c */
+ shift64RightJamming(pSig64, -expDiff, &pSig64);
+ zExp = cExp;
+ } else {
+ /* no scaling needed */
+ zExp = cExp;
+ }
+ /* Add significands and make sure explicit bit ends up in posn 62 */
+ zSig64 = pSig64 + cSig64;
+ if ((int64_t)zSig64 < 0) {
+ shift64RightJamming(zSig64, 1, &zSig64);
+ } else {
+ zExp--;
+ }
+ } else {
+ /* Subtraction */
+ if (expDiff > 0) {
+ shift64RightJamming(cSig64, expDiff, &cSig64);
+ zSig64 = pSig64 - cSig64;
+ zExp = pExp;
+ } else if (expDiff < 0) {
+ shift64RightJamming(pSig64, -expDiff, &pSig64);
+ zSig64 = cSig64 - pSig64;
+ zExp = cExp;
+ zSign ^= 1;
+ } else {
+ zExp = pExp;
+ if (cSig64 < pSig64) {
+ zSig64 = pSig64 - cSig64;
+ } else if (pSig64 < cSig64) {
+ zSig64 = cSig64 - pSig64;
+ zSign ^= 1;
+ } else {
+ /* Exact zero */
+ zSign = signflip;
+ if (STATUS(float_rounding_mode) == float_round_down) {
+ zSign ^= 1;
+ }
+ return packFloat32(zSign, 0, 0);
+ }
+ }
+ --zExp;
+ /* Normalize to put the explicit bit back into bit 62. */
+ shiftcount = countLeadingZeros64(zSig64) - 1;
+ zSig64 <<= shiftcount;
+ zExp -= shiftcount;
+ }
+ shift64RightJamming(zSig64, 32, &zSig64);
+ return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
+}
+
+
/*----------------------------------------------------------------------------
| Returns the square root of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
float32 float32_sqrt( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, zExp;
+ int_fast16_t aExp, zExp;
uint32_t aSig, zSig;
uint64_t rem, term;
a = float32_squash_input_denormal(a STATUS_VAR);
float32 float32_exp2( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
float64 r, x, xn;
int i;
float32 float32_log2( float32 a STATUS_PARAM )
{
flag aSign, zSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig, zSig, i;
a = float32_squash_input_denormal(a STATUS_VAR);
/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is equal to
-| the corresponding value `b', and 0 otherwise. The comparison is performed
+| the corresponding value `b', and 0 otherwise. The invalid exception is
+| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
+int float32_eq( float32 a, float32 b STATUS_PARAM )
{
+ uint32_t av, bv;
a = float32_squash_input_denormal(a STATUS_VAR);
b = float32_squash_input_denormal(b STATUS_VAR);
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
) {
- if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
- float_raise( float_flag_invalid STATUS_VAR);
- }
+ float_raise( float_flag_invalid STATUS_VAR);
return 0;
}
- return ( float32_val(a) == float32_val(b) ) ||
- ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
-
+ av = float32_val(a);
+ bv = float32_val(b);
+ return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
}
/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is less than
-| or equal to the corresponding value `b', and 0 otherwise. The comparison
-| is performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic.
+| or equal to the corresponding value `b', and 0 otherwise. The invalid
+| exception is raised if either operand is a NaN. The comparison is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float32_le( float32 a, float32 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is less than
-| the corresponding value `b', and 0 otherwise. The comparison is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| the corresponding value `b', and 0 otherwise. The invalid exception is
+| raised if either operand is a NaN. The comparison is performed according
+| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float32_lt( float32 a, float32 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
-| be compared, and 0 otherwise. The comparison is performed according to the
-| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| be compared, and 0 otherwise. The invalid exception is raised if either
+| operand is a NaN. The comparison is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float32_unordered( float32 a, float32 b STATUS_PARAM )
}
return 0;
}
+
/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is equal to
-| the corresponding value `b', and 0 otherwise. The invalid exception is
-| raised if either operand is a NaN. Otherwise, the comparison is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
+| exception. The comparison is performed according to the IEC/IEEE Standard
+| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
+int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
{
- uint32_t av, bv;
a = float32_squash_input_denormal(a STATUS_VAR);
b = float32_squash_input_denormal(b STATUS_VAR);
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
) {
- float_raise( float_flag_invalid STATUS_VAR);
+ if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
+ float_raise( float_flag_invalid STATUS_VAR);
+ }
return 0;
}
- av = float32_val(a);
- bv = float32_val(b);
- return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
-
+ return ( float32_val(a) == float32_val(b) ) ||
+ ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
}
/*----------------------------------------------------------------------------
int32 float64_to_int32( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig;
a = float64_squash_input_denormal(a STATUS_VAR);
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig, savedASig;
- int32 z;
+ int32_t z;
a = float64_squash_input_denormal(a STATUS_VAR);
aSig = extractFloat64Frac( a );
| returned.
*----------------------------------------------------------------------------*/
-int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
+int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig, savedASig;
int32 z;
int64 float64_to_int64( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig, aSigExtra;
a = float64_squash_input_denormal(a STATUS_VAR);
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig;
int64 z;
a = float64_squash_input_denormal(a STATUS_VAR);
float32 float64_to_float32( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig;
uint32_t zSig;
a = float64_squash_input_denormal(a STATUS_VAR);
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/
-static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
+static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
{
return make_float16(
(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
aSign = extractFloat16Sign(a);
if (aSig) {
return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
}
- return packFloat32(aSign, 0xff, aSig << 13);
+ return packFloat32(aSign, 0xff, 0);
}
if (aExp == 0) {
int8 shiftCount;
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
uint32_t mask;
uint32_t increment;
return packFloat16(aSign, aExp + 14, aSig >> 13);
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig;
a = float64_squash_input_denormal(a STATUS_VAR);
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format. The conversion is
float128 float64_to_float128( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig, zSig0, zSig1;
a = float64_squash_input_denormal(a STATUS_VAR);
}
-#endif
-
/*----------------------------------------------------------------------------
| Rounds the double-precision floating-point value `a' to an integer, and
| returns the result as a double-precision floating-point value. The
float64 float64_round_to_int( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t lastBitMask, roundBitsMask;
int8 roundingMode;
uint64_t z;
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig;
- int16 expDiff;
+ int_fast16_t expDiff;
aSig = extractFloat64Frac( a );
aExp = extractFloat64Exp( a );
return a;
}
if ( aExp == 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ if (aSig | bSig) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ }
+ return packFloat64(zSign, 0, 0);
+ }
return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
}
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig;
- int16 expDiff;
+ int_fast16_t expDiff;
aSig = extractFloat64Frac( a );
aExp = extractFloat64Exp( a );
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
{
flag aSign, bSign, zSign;
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig0, zSig1;
a = float64_squash_input_denormal(a STATUS_VAR);
float64 float64_div( float64 a, float64 b STATUS_PARAM )
{
flag aSign, bSign, zSign;
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig;
uint64_t rem0, rem1;
uint64_t term0, term1;
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
{
flag aSign, zSign;
- int16 aExp, bExp, expDiff;
+ int_fast16_t aExp, bExp, expDiff;
uint64_t aSig, bSig;
uint64_t q, alternateASig;
int64_t sigMean;
}
+/*----------------------------------------------------------------------------
+| Returns the result of multiplying the double-precision floating-point values
+| `a' and `b' then adding 'c', with no intermediate rounding step after the
+| multiplication. The operation is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic 754-2008.
+| The flags argument allows the caller to select negation of the
+| addend, the intermediate product, or the final result. (The difference
+| between this and having the caller do a separate negation is that negating
+| externally will flip the sign bit on NaNs.)
+*----------------------------------------------------------------------------*/
+
+float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
+{
+ flag aSign, bSign, cSign, zSign;
+ int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
+ uint64_t aSig, bSig, cSig;
+ flag pInf, pZero, pSign;
+ uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
+ int shiftcount;
+ flag signflip, infzero;
+
+ a = float64_squash_input_denormal(a STATUS_VAR);
+ b = float64_squash_input_denormal(b STATUS_VAR);
+ c = float64_squash_input_denormal(c STATUS_VAR);
+ aSig = extractFloat64Frac(a);
+ aExp = extractFloat64Exp(a);
+ aSign = extractFloat64Sign(a);
+ bSig = extractFloat64Frac(b);
+ bExp = extractFloat64Exp(b);
+ bSign = extractFloat64Sign(b);
+ cSig = extractFloat64Frac(c);
+ cExp = extractFloat64Exp(c);
+ cSign = extractFloat64Sign(c);
+
+ infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
+ (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
+
+ /* It is implementation-defined whether the cases of (0,inf,qnan)
+ * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
+ * they return if they do), so we have to hand this information
+ * off to the target-specific pick-a-NaN routine.
+ */
+ if (((aExp == 0x7ff) && aSig) ||
+ ((bExp == 0x7ff) && bSig) ||
+ ((cExp == 0x7ff) && cSig)) {
+ return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
+ }
+
+ if (infzero) {
+ float_raise(float_flag_invalid STATUS_VAR);
+ return float64_default_nan;
+ }
+
+ if (flags & float_muladd_negate_c) {
+ cSign ^= 1;
+ }
+
+ signflip = (flags & float_muladd_negate_result) ? 1 : 0;
+
+ /* Work out the sign and type of the product */
+ pSign = aSign ^ bSign;
+ if (flags & float_muladd_negate_product) {
+ pSign ^= 1;
+ }
+ pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
+ pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
+
+ if (cExp == 0x7ff) {
+ if (pInf && (pSign ^ cSign)) {
+ /* addition of opposite-signed infinities => InvalidOperation */
+ float_raise(float_flag_invalid STATUS_VAR);
+ return float64_default_nan;
+ }
+ /* Otherwise generate an infinity of the same sign */
+ return packFloat64(cSign ^ signflip, 0x7ff, 0);
+ }
+
+ if (pInf) {
+ return packFloat64(pSign ^ signflip, 0x7ff, 0);
+ }
+
+ if (pZero) {
+ if (cExp == 0) {
+ if (cSig == 0) {
+ /* Adding two exact zeroes */
+ if (pSign == cSign) {
+ zSign = pSign;
+ } else if (STATUS(float_rounding_mode) == float_round_down) {
+ zSign = 1;
+ } else {
+ zSign = 0;
+ }
+ return packFloat64(zSign ^ signflip, 0, 0);
+ }
+ /* Exact zero plus a denorm */
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat64(cSign ^ signflip, 0, 0);
+ }
+ }
+ /* Zero plus something non-zero : just return the something */
+ return make_float64(float64_val(c) ^ ((uint64_t)signflip << 63));
+ }
+
+ if (aExp == 0) {
+ normalizeFloat64Subnormal(aSig, &aExp, &aSig);
+ }
+ if (bExp == 0) {
+ normalizeFloat64Subnormal(bSig, &bExp, &bSig);
+ }
+
+ /* Calculate the actual result a * b + c */
+
+ /* Multiply first; this is easy. */
+ /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
+ * because we want the true exponent, not the "one-less-than"
+ * flavour that roundAndPackFloat64() takes.
+ */
+ pExp = aExp + bExp - 0x3fe;
+ aSig = (aSig | LIT64(0x0010000000000000))<<10;
+ bSig = (bSig | LIT64(0x0010000000000000))<<11;
+ mul64To128(aSig, bSig, &pSig0, &pSig1);
+ if ((int64_t)(pSig0 << 1) >= 0) {
+ shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
+ pExp--;
+ }
+
+ zSign = pSign ^ signflip;
+
+ /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
+ * bit in position 126.
+ */
+ if (cExp == 0) {
+ if (!cSig) {
+ /* Throw out the special case of c being an exact zero now */
+ shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
+ return roundAndPackFloat64(zSign, pExp - 1,
+ pSig1 STATUS_VAR);
+ }
+ normalizeFloat64Subnormal(cSig, &cExp, &cSig);
+ }
+
+ /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
+ * significand of the addend, with the explicit bit in position 126.
+ */
+ cSig0 = cSig << (126 - 64 - 52);
+ cSig1 = 0;
+ cSig0 |= LIT64(0x4000000000000000);
+ expDiff = pExp - cExp;
+
+ if (pSign == cSign) {
+ /* Addition */
+ if (expDiff > 0) {
+ /* scale c to match p */
+ shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
+ zExp = pExp;
+ } else if (expDiff < 0) {
+ /* scale p to match c */
+ shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
+ zExp = cExp;
+ } else {
+ /* no scaling needed */
+ zExp = cExp;
+ }
+ /* Add significands and make sure explicit bit ends up in posn 126 */
+ add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+ if ((int64_t)zSig0 < 0) {
+ shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
+ } else {
+ zExp--;
+ }
+ shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
+ return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
+ } else {
+ /* Subtraction */
+ if (expDiff > 0) {
+ shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
+ sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+ zExp = pExp;
+ } else if (expDiff < 0) {
+ shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
+ sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
+ zExp = cExp;
+ zSign ^= 1;
+ } else {
+ zExp = pExp;
+ if (lt128(cSig0, cSig1, pSig0, pSig1)) {
+ sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+ } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
+ sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
+ zSign ^= 1;
+ } else {
+ /* Exact zero */
+ zSign = signflip;
+ if (STATUS(float_rounding_mode) == float_round_down) {
+ zSign ^= 1;
+ }
+ return packFloat64(zSign, 0, 0);
+ }
+ }
+ --zExp;
+ /* Do the equivalent of normalizeRoundAndPackFloat64() but
+ * starting with the significand in a pair of uint64_t.
+ */
+ if (zSig0) {
+ shiftcount = countLeadingZeros64(zSig0) - 1;
+ shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
+ if (zSig1) {
+ zSig0 |= 1;
+ }
+ zExp -= shiftcount;
+ } else {
+ shiftcount = countLeadingZeros64(zSig1) - 1;
+ zSig0 = zSig1 << shiftcount;
+ zExp -= (shiftcount + 64);
+ }
+ return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
+ }
+}
+
/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
float64 float64_sqrt( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, zExp;
+ int_fast16_t aExp, zExp;
uint64_t aSig, zSig, doubleZSig;
uint64_t rem0, rem1, term0, term1;
a = float64_squash_input_denormal(a STATUS_VAR);
float64 float64_log2( float64 a STATUS_PARAM )
{
flag aSign, zSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig, aSig0, aSig1, zSig, i;
a = float64_squash_input_denormal(a STATUS_VAR);
/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is equal to the
-| corresponding value `b', and 0 otherwise. The comparison is performed
+| corresponding value `b', and 0 otherwise. The invalid exception is raised
+| if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
+int float64_eq( float64 a, float64 b STATUS_PARAM )
{
uint64_t av, bv;
a = float64_squash_input_denormal(a STATUS_VAR);
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
) {
- if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
- float_raise( float_flag_invalid STATUS_VAR);
- }
+ float_raise( float_flag_invalid STATUS_VAR);
return 0;
}
av = float64_val(a);
/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is less than or
-| equal to the corresponding value `b', and 0 otherwise. The comparison is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic.
+| equal to the corresponding value `b', and 0 otherwise. The invalid
+| exception is raised if either operand is a NaN. The comparison is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float64_le( float64 a, float64 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is less than
-| the corresponding value `b', and 0 otherwise. The comparison is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| the corresponding value `b', and 0 otherwise. The invalid exception is
+| raised if either operand is a NaN. The comparison is performed according
+| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float64_lt( float64 a, float64 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
-| be compared, and 0 otherwise. The comparison is performed according to the
-| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| be compared, and 0 otherwise. The invalid exception is raised if either
+| operand is a NaN. The comparison is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float64_unordered( float64 a, float64 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is equal to the
-| corresponding value `b', and 0 otherwise. The invalid exception is raised
-| if either operand is a NaN. Otherwise, the comparison is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
+| exception.The comparison is performed according to the IEC/IEEE Standard
+| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
+int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
{
uint64_t av, bv;
a = float64_squash_input_denormal(a STATUS_VAR);
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
) {
- float_raise( float_flag_invalid STATUS_VAR);
+ if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
+ float_raise( float_flag_invalid STATUS_VAR);
+ }
return 0;
}
av = float64_val(a);
return 0;
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format. The
flag aSign;
int32 aExp, shiftCount;
uint64_t aSig, savedASig;
- int32 z;
+ int32_t z;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
}
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format. The
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig, zSig0, zSig1;
aSig = extractFloatx80Frac( a );
}
-#endif
-
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended quadruple-precision floating-point
}
/*----------------------------------------------------------------------------
-| Returns 1 if the extended double-precision floating-point value `a' is
-| equal to the corresponding value `b', and 0 otherwise. The comparison is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic.
+| Returns 1 if the extended double-precision floating-point value `a' is equal
+| to the corresponding value `b', and 0 otherwise. The invalid exception is
+| raised if either operand is a NaN. Otherwise, the comparison is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
+int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
{
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
) {
- if ( floatx80_is_signaling_nan( a )
- || floatx80_is_signaling_nan( b ) ) {
- float_raise( float_flag_invalid STATUS_VAR);
- }
+ float_raise( float_flag_invalid STATUS_VAR);
return 0;
}
return
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
-| comparison is performed according to the IEC/IEEE Standard for Binary
-| Floating-Point Arithmetic.
+| invalid exception is raised if either operand is a NaN. The comparison is
+| performed according to the IEC/IEEE Standard for Binary Floating-Point
+| Arithmetic.
*----------------------------------------------------------------------------*/
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
-| less than the corresponding value `b', and 0 otherwise. The comparison
-| is performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic.
+| less than the corresponding value `b', and 0 otherwise. The invalid
+| exception is raised if either operand is a NaN. The comparison is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point values `a' and `b'
-| cannot be compared, and 0 otherwise. The comparison is performed according
-| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| cannot be compared, and 0 otherwise. The invalid exception is raised if
+| either operand is a NaN. The comparison is performed according to the
+| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
{
}
/*----------------------------------------------------------------------------
-| Returns 1 if the extended double-precision floating-point value `a' is equal
-| to the corresponding value `b', and 0 otherwise. The invalid exception is
-| raised if either operand is a NaN. Otherwise, the comparison is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| Returns 1 if the extended double-precision floating-point value `a' is
+| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
+| cause an exception. The comparison is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
+int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
{
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
) {
- float_raise( float_flag_invalid STATUS_VAR);
+ if ( floatx80_is_signaling_nan( a )
+ || floatx80_is_signaling_nan( b ) ) {
+ float_raise( float_flag_invalid STATUS_VAR);
+ }
return 0;
}
return
return 0;
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format. The conversion
flag aSign;
int32 aExp, shiftCount;
uint64_t aSig0, aSig1, savedASig;
- int32 z;
+ int32_t z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format. The
}
-#endif
-
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value. The
}
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
if ( aExp == 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ if (zSig0 | zSig1) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ }
+ return packFloat128(zSign, 0, 0, 0);
+ }
return packFloat128( zSign, 0, zSig0, zSig1 );
}
zSig2 = 0;
/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
-| the corresponding value `b', and 0 otherwise. The comparison is performed
+| the corresponding value `b', and 0 otherwise. The invalid exception is
+| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
+int float128_eq( float128 a, float128 b STATUS_PARAM )
{
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
) {
- if ( float128_is_signaling_nan( a )
- || float128_is_signaling_nan( b ) ) {
- float_raise( float_flag_invalid STATUS_VAR);
- }
+ float_raise( float_flag_invalid STATUS_VAR);
return 0;
}
return
/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
-| or equal to the corresponding value `b', and 0 otherwise. The comparison
-| is performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic.
+| or equal to the corresponding value `b', and 0 otherwise. The invalid
+| exception is raised if either operand is a NaN. The comparison is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float128_le( float128 a, float128 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
-| the corresponding value `b', and 0 otherwise. The comparison is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| the corresponding value `b', and 0 otherwise. The invalid exception is
+| raised if either operand is a NaN. The comparison is performed according
+| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float128_lt( float128 a, float128 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
-| be compared, and 0 otherwise. The comparison is performed according to the
-| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| be compared, and 0 otherwise. The invalid exception is raised if either
+| operand is a NaN. The comparison is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
int float128_unordered( float128 a, float128 b STATUS_PARAM )
/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
-| the corresponding value `b', and 0 otherwise. The invalid exception is
-| raised if either operand is a NaN. Otherwise, the comparison is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
+| exception. The comparison is performed according to the IEC/IEEE Standard
+| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
+int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
{
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
) {
- float_raise( float_flag_invalid STATUS_VAR);
+ if ( float128_is_signaling_nan( a )
+ || float128_is_signaling_nan( b ) ) {
+ float_raise( float_flag_invalid STATUS_VAR);
+ }
return 0;
}
return
return 0;
}
-#endif
-
/* misc functions */
-float32 uint32_to_float32( unsigned int a STATUS_PARAM )
+float32 uint32_to_float32( uint32 a STATUS_PARAM )
{
return int64_to_float32(a STATUS_VAR);
}
-float64 uint32_to_float64( unsigned int a STATUS_PARAM )
+float64 uint32_to_float64( uint32 a STATUS_PARAM )
{
return int64_to_float64(a STATUS_VAR);
}
-unsigned int float32_to_uint32( float32 a STATUS_PARAM )
+uint32 float32_to_uint32( float32 a STATUS_PARAM )
{
int64_t v;
- unsigned int res;
+ uint32 res;
v = float32_to_int64(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
+uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
{
int64_t v;
- unsigned int res;
+ uint32 res;
v = float32_to_int64_round_to_zero(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
+uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
{
int64_t v;
- unsigned int res;
+ uint_fast16_t res;
v = float32_to_int64_round_to_zero(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float64_to_uint32( float64 a STATUS_PARAM )
+uint32 float64_to_uint32( float64 a STATUS_PARAM )
{
int64_t v;
- unsigned int res;
+ uint32 res;
v = float64_to_int64(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
+uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
{
int64_t v;
- unsigned int res;
+ uint32 res;
v = float64_to_int64_round_to_zero(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
+uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
{
int64_t v;
- unsigned int res;
+ uint_fast16_t res;
v = float64_to_int64_round_to_zero(a STATUS_VAR);
if (v < 0) {
COMPARE(32, 0xff)
COMPARE(64, 0x7ff)
+INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
+ int is_quiet STATUS_PARAM )
+{
+ flag aSign, bSign;
+
+ if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
+ ( extractFloatx80Frac( a )<<1 ) ) ||
+ ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
+ ( extractFloatx80Frac( b )<<1 ) )) {
+ if (!is_quiet ||
+ floatx80_is_signaling_nan( a ) ||
+ floatx80_is_signaling_nan( b ) ) {
+ float_raise( float_flag_invalid STATUS_VAR);
+ }
+ return float_relation_unordered;
+ }
+ aSign = extractFloatx80Sign( a );
+ bSign = extractFloatx80Sign( b );
+ if ( aSign != bSign ) {
+
+ if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
+ ( ( a.low | b.low ) == 0 ) ) {
+ /* zero case */
+ return float_relation_equal;
+ } else {
+ return 1 - (2 * aSign);
+ }
+ } else {
+ if (a.low == b.low && a.high == b.high) {
+ return float_relation_equal;
+ } else {
+ return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
+ }
+ }
+}
+
+int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
+{
+ return floatx80_compare_internal(a, b, 0 STATUS_VAR);
+}
+
+int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
+{
+ return floatx80_compare_internal(a, b, 1 STATUS_VAR);
+}
+
INLINE int float128_compare_internal( float128 a, float128 b,
int is_quiet STATUS_PARAM )
{
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int16_t aExp;
uint32_t aSig;
a = float32_squash_input_denormal(a STATUS_VAR);
aSign = extractFloat32Sign( a );
if ( aExp == 0xFF ) {
+ if ( aSig ) {
+ return propagateFloat32NaN( a, a STATUS_VAR );
+ }
return a;
}
if ( aExp != 0 )
else if ( aSig == 0 )
return a;
+ if (n > 0x200) {
+ n = 0x200;
+ } else if (n < -0x200) {
+ n = -0x200;
+ }
+
aExp += n - 1;
aSig <<= 7;
return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int16_t aExp;
uint64_t aSig;
a = float64_squash_input_denormal(a STATUS_VAR);
aSign = extractFloat64Sign( a );
if ( aExp == 0x7FF ) {
+ if ( aSig ) {
+ return propagateFloat64NaN( a, a STATUS_VAR );
+ }
return a;
}
if ( aExp != 0 )
else if ( aSig == 0 )
return a;
+ if (n > 0x1000) {
+ n = 0x1000;
+ } else if (n < -0x1000) {
+ n = -0x1000;
+ }
+
aExp += n - 1;
aSig <<= 10;
return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
}
-#ifdef FLOATX80
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int32_t aExp;
uint64_t aSig;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
- if ( aExp == 0x7FF ) {
+ if ( aExp == 0x7FFF ) {
+ if ( aSig<<1 ) {
+ return propagateFloatx80NaN( a, a STATUS_VAR );
+ }
return a;
}
+
if (aExp == 0 && aSig == 0)
return a;
+ if (n > 0x10000) {
+ n = 0x10000;
+ } else if (n < -0x10000) {
+ n = -0x10000;
+ }
+
aExp += n;
return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
aSign, aExp, aSig, 0 STATUS_VAR );
}
-#endif
-#ifdef FLOAT128
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
{
flag aSign;
- int32 aExp;
+ int32_t aExp;
uint64_t aSig0, aSig1;
aSig1 = extractFloat128Frac1( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
if ( aExp == 0x7FFF ) {
+ if ( aSig0 | aSig1 ) {
+ return propagateFloat128NaN( a, a STATUS_VAR );
+ }
return a;
}
if ( aExp != 0 )
else if ( aSig0 == 0 && aSig1 == 0 )
return a;
+ if (n > 0x10000) {
+ n = 0x10000;
+ } else if (n < -0x10000) {
+ n = -0x10000;
+ }
+
aExp += n - 1;
return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
STATUS_VAR );
}
-#endif