=============================================================================*/
+/* softfloat (and in particular the code in softfloat-specialize.h) is
+ * target-dependent and needs the TARGET_* macros.
+ */
+#include "config.h"
+
#include "softfloat.h"
/*----------------------------------------------------------------------------
STATUS(float_exception_flags) = val;
}
-#ifdef FLOATX80
void set_floatx80_rounding_precision(int val STATUS_PARAM)
{
STATUS(floatx80_rounding_precision) = val;
}
-#endif
/*----------------------------------------------------------------------------
| Returns the fraction bits of the half-precision floating-point value `a'.
| Returns the exponent bits of the half-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
-INLINE int16 extractFloat16Exp(float16 a)
+INLINE int_fast16_t extractFloat16Exp(float16 a)
{
return (float16_val(a) >> 10) & 0x1f;
}
int8 roundingMode;
flag roundNearestEven;
int8 roundIncrement, roundBits;
- int32 z;
+ int32_t z;
roundingMode = STATUS(float_rounding_mode);
roundNearestEven = ( roundingMode == float_round_nearest_even );
{
int8 roundingMode;
flag roundNearestEven, increment;
- int64 z;
+ int64_t z;
roundingMode = STATUS(float_rounding_mode);
roundNearestEven = ( roundingMode == float_round_nearest_even );
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
-INLINE int16 extractFloat32Exp( float32 a )
+INLINE int_fast16_t extractFloat32Exp(float32 a)
{
return ( float32_val(a)>>23 ) & 0xFF;
*----------------------------------------------------------------------------*/
static void
- normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
+ normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
{
int8 shiftCount;
| significand.
*----------------------------------------------------------------------------*/
-INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
+INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
{
return make_float32(
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
+static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
{
int8 roundingMode;
flag roundNearestEven;
return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
}
if ( zExp < 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat32(zSign, 0, 0);
+ }
isTiny =
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
|| ( zExp < -1 )
*----------------------------------------------------------------------------*/
static float32
- normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
+ normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
{
int8 shiftCount;
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
-INLINE int16 extractFloat64Exp( float64 a )
+INLINE int_fast16_t extractFloat64Exp(float64 a)
{
return ( float64_val(a)>>52 ) & 0x7FF;
*----------------------------------------------------------------------------*/
static void
- normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
+ normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
{
int8 shiftCount;
| significand.
*----------------------------------------------------------------------------*/
-INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
+INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
{
return make_float64(
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
-static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
+static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
{
int8 roundingMode;
flag roundNearestEven;
- int16 roundIncrement, roundBits;
+ int_fast16_t roundIncrement, roundBits;
flag isTiny;
roundingMode = STATUS(float_rounding_mode);
return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
}
if ( zExp < 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat64(zSign, 0, 0);
+ }
isTiny =
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
|| ( zExp < -1 )
*----------------------------------------------------------------------------*/
static float64
- normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
+ normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
{
int8 shiftCount;
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the fraction bits of the extended double-precision floating-point
| value `a'.
goto overflow;
}
if ( zExp <= 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloatx80(zSign, 0, 0);
+ }
isTiny =
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
|| ( zExp < 0 )
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( zExp < 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat128(zSign, 0, 0, 0);
+ }
isTiny =
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
|| ( zExp < -1 )
}
-#endif
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the single-precision floating-point format. The conversion is performed
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the extended double-precision floating-point format. The conversion
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a' to
| the quadruple-precision floating-point format. The conversion is performed
}
-#endif
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the single-precision floating-point format. The conversion is performed
if ( a == 0 ) return float32_zero;
shiftCount = countLeadingZeros64( a ) - 40;
if ( 0 <= shiftCount ) {
- return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
+ return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
}
else {
shiftCount += 7;
else {
a <<= shiftCount;
}
- return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
+ return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
}
}
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the extended double-precision floating-point format. The conversion
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a' to
| the quadruple-precision floating-point format. The conversion is performed
}
-#endif
-
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the 32-bit two's complement integer format. The conversion is
int32 float32_to_int32( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
uint64_t aSig64;
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
- int32 z;
+ int32_t z;
a = float32_squash_input_denormal(a STATUS_VAR);
aSig = extractFloat32Frac( a );
| returned.
*----------------------------------------------------------------------------*/
-int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
+int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
int32 z;
int64 float32_to_int64( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
uint64_t aSig64, aSigExtra;
a = float32_squash_input_denormal(a STATUS_VAR);
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint32_t aSig;
uint64_t aSig64;
int64 z;
float64 float32_to_float64( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
a = float32_squash_input_denormal(a STATUS_VAR);
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
a = float32_squash_input_denormal(a STATUS_VAR);
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the double-precision floating-point format. The conversion is
float128 float32_to_float128( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
a = float32_squash_input_denormal(a STATUS_VAR);
}
-#endif
-
/*----------------------------------------------------------------------------
| Rounds the single-precision floating-point value `a' to an integer, and
| returns the result as a single-precision floating-point value. The
float32 float32_round_to_int( float32 a STATUS_PARAM)
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t lastBitMask, roundBitsMask;
int8 roundingMode;
uint32_t z;
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
{
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint32_t aSig, bSig, zSig;
- int16 expDiff;
+ int_fast16_t expDiff;
aSig = extractFloat32Frac( a );
aExp = extractFloat32Exp( a );
return a;
}
if ( aExp == 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ if (aSig | bSig) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ }
+ return packFloat32(zSign, 0, 0);
+ }
return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
}
zSig = 0x40000000 + aSig + bSig;
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
{
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint32_t aSig, bSig, zSig;
- int16 expDiff;
+ int_fast16_t expDiff;
aSig = extractFloat32Frac( a );
aExp = extractFloat32Exp( a );
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
{
flag aSign, bSign, zSign;
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint32_t aSig, bSig;
uint64_t zSig64;
uint32_t zSig;
float32 float32_div( float32 a, float32 b STATUS_PARAM )
{
flag aSign, bSign, zSign;
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint32_t aSig, bSig, zSig;
a = float32_squash_input_denormal(a STATUS_VAR);
b = float32_squash_input_denormal(b STATUS_VAR);
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
{
flag aSign, zSign;
- int16 aExp, bExp, expDiff;
+ int_fast16_t aExp, bExp, expDiff;
uint32_t aSig, bSig;
uint32_t q;
uint64_t aSig64, bSig64, q64;
}
+/*----------------------------------------------------------------------------
+| Returns the result of multiplying the single-precision floating-point values
+| `a' and `b' then adding 'c', with no intermediate rounding step after the
+| multiplication. The operation is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic 754-2008.
+| The flags argument allows the caller to select negation of the
+| addend, the intermediate product, or the final result. (The difference
+| between this and having the caller do a separate negation is that negating
+| externally will flip the sign bit on NaNs.)
+*----------------------------------------------------------------------------*/
+
+float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
+{
+ flag aSign, bSign, cSign, zSign;
+ int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
+ uint32_t aSig, bSig, cSig;
+ flag pInf, pZero, pSign;
+ uint64_t pSig64, cSig64, zSig64;
+ uint32_t pSig;
+ int shiftcount;
+ flag signflip, infzero;
+
+ a = float32_squash_input_denormal(a STATUS_VAR);
+ b = float32_squash_input_denormal(b STATUS_VAR);
+ c = float32_squash_input_denormal(c STATUS_VAR);
+ aSig = extractFloat32Frac(a);
+ aExp = extractFloat32Exp(a);
+ aSign = extractFloat32Sign(a);
+ bSig = extractFloat32Frac(b);
+ bExp = extractFloat32Exp(b);
+ bSign = extractFloat32Sign(b);
+ cSig = extractFloat32Frac(c);
+ cExp = extractFloat32Exp(c);
+ cSign = extractFloat32Sign(c);
+
+ infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
+ (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
+
+ /* It is implementation-defined whether the cases of (0,inf,qnan)
+ * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
+ * they return if they do), so we have to hand this information
+ * off to the target-specific pick-a-NaN routine.
+ */
+ if (((aExp == 0xff) && aSig) ||
+ ((bExp == 0xff) && bSig) ||
+ ((cExp == 0xff) && cSig)) {
+ return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
+ }
+
+ if (infzero) {
+ float_raise(float_flag_invalid STATUS_VAR);
+ return float32_default_nan;
+ }
+
+ if (flags & float_muladd_negate_c) {
+ cSign ^= 1;
+ }
+
+ signflip = (flags & float_muladd_negate_result) ? 1 : 0;
+
+ /* Work out the sign and type of the product */
+ pSign = aSign ^ bSign;
+ if (flags & float_muladd_negate_product) {
+ pSign ^= 1;
+ }
+ pInf = (aExp == 0xff) || (bExp == 0xff);
+ pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
+
+ if (cExp == 0xff) {
+ if (pInf && (pSign ^ cSign)) {
+ /* addition of opposite-signed infinities => InvalidOperation */
+ float_raise(float_flag_invalid STATUS_VAR);
+ return float32_default_nan;
+ }
+ /* Otherwise generate an infinity of the same sign */
+ return packFloat32(cSign ^ signflip, 0xff, 0);
+ }
+
+ if (pInf) {
+ return packFloat32(pSign ^ signflip, 0xff, 0);
+ }
+
+ if (pZero) {
+ if (cExp == 0) {
+ if (cSig == 0) {
+ /* Adding two exact zeroes */
+ if (pSign == cSign) {
+ zSign = pSign;
+ } else if (STATUS(float_rounding_mode) == float_round_down) {
+ zSign = 1;
+ } else {
+ zSign = 0;
+ }
+ return packFloat32(zSign ^ signflip, 0, 0);
+ }
+ /* Exact zero plus a denorm */
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat32(cSign ^ signflip, 0, 0);
+ }
+ }
+ /* Zero plus something non-zero : just return the something */
+ return make_float32(float32_val(c) ^ (signflip << 31));
+ }
+
+ if (aExp == 0) {
+ normalizeFloat32Subnormal(aSig, &aExp, &aSig);
+ }
+ if (bExp == 0) {
+ normalizeFloat32Subnormal(bSig, &bExp, &bSig);
+ }
+
+ /* Calculate the actual result a * b + c */
+
+ /* Multiply first; this is easy. */
+ /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
+ * because we want the true exponent, not the "one-less-than"
+ * flavour that roundAndPackFloat32() takes.
+ */
+ pExp = aExp + bExp - 0x7e;
+ aSig = (aSig | 0x00800000) << 7;
+ bSig = (bSig | 0x00800000) << 8;
+ pSig64 = (uint64_t)aSig * bSig;
+ if ((int64_t)(pSig64 << 1) >= 0) {
+ pSig64 <<= 1;
+ pExp--;
+ }
+
+ zSign = pSign ^ signflip;
+
+ /* Now pSig64 is the significand of the multiply, with the explicit bit in
+ * position 62.
+ */
+ if (cExp == 0) {
+ if (!cSig) {
+ /* Throw out the special case of c being an exact zero now */
+ shift64RightJamming(pSig64, 32, &pSig64);
+ pSig = pSig64;
+ return roundAndPackFloat32(zSign, pExp - 1,
+ pSig STATUS_VAR);
+ }
+ normalizeFloat32Subnormal(cSig, &cExp, &cSig);
+ }
+
+ cSig64 = (uint64_t)cSig << (62 - 23);
+ cSig64 |= LIT64(0x4000000000000000);
+ expDiff = pExp - cExp;
+
+ if (pSign == cSign) {
+ /* Addition */
+ if (expDiff > 0) {
+ /* scale c to match p */
+ shift64RightJamming(cSig64, expDiff, &cSig64);
+ zExp = pExp;
+ } else if (expDiff < 0) {
+ /* scale p to match c */
+ shift64RightJamming(pSig64, -expDiff, &pSig64);
+ zExp = cExp;
+ } else {
+ /* no scaling needed */
+ zExp = cExp;
+ }
+ /* Add significands and make sure explicit bit ends up in posn 62 */
+ zSig64 = pSig64 + cSig64;
+ if ((int64_t)zSig64 < 0) {
+ shift64RightJamming(zSig64, 1, &zSig64);
+ } else {
+ zExp--;
+ }
+ } else {
+ /* Subtraction */
+ if (expDiff > 0) {
+ shift64RightJamming(cSig64, expDiff, &cSig64);
+ zSig64 = pSig64 - cSig64;
+ zExp = pExp;
+ } else if (expDiff < 0) {
+ shift64RightJamming(pSig64, -expDiff, &pSig64);
+ zSig64 = cSig64 - pSig64;
+ zExp = cExp;
+ zSign ^= 1;
+ } else {
+ zExp = pExp;
+ if (cSig64 < pSig64) {
+ zSig64 = pSig64 - cSig64;
+ } else if (pSig64 < cSig64) {
+ zSig64 = cSig64 - pSig64;
+ zSign ^= 1;
+ } else {
+ /* Exact zero */
+ zSign = signflip;
+ if (STATUS(float_rounding_mode) == float_round_down) {
+ zSign ^= 1;
+ }
+ return packFloat32(zSign, 0, 0);
+ }
+ }
+ --zExp;
+ /* Normalize to put the explicit bit back into bit 62. */
+ shiftcount = countLeadingZeros64(zSig64) - 1;
+ zSig64 <<= shiftcount;
+ zExp -= shiftcount;
+ }
+ shift64RightJamming(zSig64, 32, &zSig64);
+ return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
+}
+
+
/*----------------------------------------------------------------------------
| Returns the square root of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
float32 float32_sqrt( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, zExp;
+ int_fast16_t aExp, zExp;
uint32_t aSig, zSig;
uint64_t rem, term;
a = float32_squash_input_denormal(a STATUS_VAR);
float32 float32_exp2( float32 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
float64 r, x, xn;
int i;
float32 float32_log2( float32 a STATUS_PARAM )
{
flag aSign, zSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig, zSig, i;
a = float32_squash_input_denormal(a STATUS_VAR);
int32 float64_to_int32( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig;
a = float64_squash_input_denormal(a STATUS_VAR);
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig, savedASig;
- int32 z;
+ int32_t z;
a = float64_squash_input_denormal(a STATUS_VAR);
aSig = extractFloat64Frac( a );
| returned.
*----------------------------------------------------------------------------*/
-int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
+int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig, savedASig;
int32 z;
int64 float64_to_int64( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig, aSigExtra;
a = float64_squash_input_denormal(a STATUS_VAR);
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, shiftCount;
+ int_fast16_t aExp, shiftCount;
uint64_t aSig;
int64 z;
a = float64_squash_input_denormal(a STATUS_VAR);
float32 float64_to_float32( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig;
uint32_t zSig;
a = float64_squash_input_denormal(a STATUS_VAR);
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/
-static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
+static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
{
return make_float16(
(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
aSign = extractFloat16Sign(a);
if (aSig) {
return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
}
- return packFloat32(aSign, 0xff, aSig << 13);
+ return packFloat32(aSign, 0xff, 0);
}
if (aExp == 0) {
int8 shiftCount;
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint32_t aSig;
uint32_t mask;
uint32_t increment;
return packFloat16(aSign, aExp + 14, aSig >> 13);
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig;
a = float64_squash_input_denormal(a STATUS_VAR);
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format. The conversion is
float128 float64_to_float128( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig, zSig0, zSig1;
a = float64_squash_input_denormal(a STATUS_VAR);
}
-#endif
-
/*----------------------------------------------------------------------------
| Rounds the double-precision floating-point value `a' to an integer, and
| returns the result as a double-precision floating-point value. The
float64 float64_round_to_int( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t lastBitMask, roundBitsMask;
int8 roundingMode;
uint64_t z;
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig;
- int16 expDiff;
+ int_fast16_t expDiff;
aSig = extractFloat64Frac( a );
aExp = extractFloat64Exp( a );
return a;
}
if ( aExp == 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ if (aSig | bSig) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ }
+ return packFloat64(zSign, 0, 0);
+ }
return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
}
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig;
- int16 expDiff;
+ int_fast16_t expDiff;
aSig = extractFloat64Frac( a );
aExp = extractFloat64Exp( a );
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
{
flag aSign, bSign, zSign;
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig0, zSig1;
a = float64_squash_input_denormal(a STATUS_VAR);
float64 float64_div( float64 a, float64 b STATUS_PARAM )
{
flag aSign, bSign, zSign;
- int16 aExp, bExp, zExp;
+ int_fast16_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig;
uint64_t rem0, rem1;
uint64_t term0, term1;
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
{
flag aSign, zSign;
- int16 aExp, bExp, expDiff;
+ int_fast16_t aExp, bExp, expDiff;
uint64_t aSig, bSig;
uint64_t q, alternateASig;
int64_t sigMean;
}
+/*----------------------------------------------------------------------------
+| Returns the result of multiplying the double-precision floating-point values
+| `a' and `b' then adding 'c', with no intermediate rounding step after the
+| multiplication. The operation is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic 754-2008.
+| The flags argument allows the caller to select negation of the
+| addend, the intermediate product, or the final result. (The difference
+| between this and having the caller do a separate negation is that negating
+| externally will flip the sign bit on NaNs.)
+*----------------------------------------------------------------------------*/
+
+float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
+{
+ flag aSign, bSign, cSign, zSign;
+ int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
+ uint64_t aSig, bSig, cSig;
+ flag pInf, pZero, pSign;
+ uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
+ int shiftcount;
+ flag signflip, infzero;
+
+ a = float64_squash_input_denormal(a STATUS_VAR);
+ b = float64_squash_input_denormal(b STATUS_VAR);
+ c = float64_squash_input_denormal(c STATUS_VAR);
+ aSig = extractFloat64Frac(a);
+ aExp = extractFloat64Exp(a);
+ aSign = extractFloat64Sign(a);
+ bSig = extractFloat64Frac(b);
+ bExp = extractFloat64Exp(b);
+ bSign = extractFloat64Sign(b);
+ cSig = extractFloat64Frac(c);
+ cExp = extractFloat64Exp(c);
+ cSign = extractFloat64Sign(c);
+
+ infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
+ (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
+
+ /* It is implementation-defined whether the cases of (0,inf,qnan)
+ * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
+ * they return if they do), so we have to hand this information
+ * off to the target-specific pick-a-NaN routine.
+ */
+ if (((aExp == 0x7ff) && aSig) ||
+ ((bExp == 0x7ff) && bSig) ||
+ ((cExp == 0x7ff) && cSig)) {
+ return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
+ }
+
+ if (infzero) {
+ float_raise(float_flag_invalid STATUS_VAR);
+ return float64_default_nan;
+ }
+
+ if (flags & float_muladd_negate_c) {
+ cSign ^= 1;
+ }
+
+ signflip = (flags & float_muladd_negate_result) ? 1 : 0;
+
+ /* Work out the sign and type of the product */
+ pSign = aSign ^ bSign;
+ if (flags & float_muladd_negate_product) {
+ pSign ^= 1;
+ }
+ pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
+ pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
+
+ if (cExp == 0x7ff) {
+ if (pInf && (pSign ^ cSign)) {
+ /* addition of opposite-signed infinities => InvalidOperation */
+ float_raise(float_flag_invalid STATUS_VAR);
+ return float64_default_nan;
+ }
+ /* Otherwise generate an infinity of the same sign */
+ return packFloat64(cSign ^ signflip, 0x7ff, 0);
+ }
+
+ if (pInf) {
+ return packFloat64(pSign ^ signflip, 0x7ff, 0);
+ }
+
+ if (pZero) {
+ if (cExp == 0) {
+ if (cSig == 0) {
+ /* Adding two exact zeroes */
+ if (pSign == cSign) {
+ zSign = pSign;
+ } else if (STATUS(float_rounding_mode) == float_round_down) {
+ zSign = 1;
+ } else {
+ zSign = 0;
+ }
+ return packFloat64(zSign ^ signflip, 0, 0);
+ }
+ /* Exact zero plus a denorm */
+ if (STATUS(flush_to_zero)) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ return packFloat64(cSign ^ signflip, 0, 0);
+ }
+ }
+ /* Zero plus something non-zero : just return the something */
+ return make_float64(float64_val(c) ^ ((uint64_t)signflip << 63));
+ }
+
+ if (aExp == 0) {
+ normalizeFloat64Subnormal(aSig, &aExp, &aSig);
+ }
+ if (bExp == 0) {
+ normalizeFloat64Subnormal(bSig, &bExp, &bSig);
+ }
+
+ /* Calculate the actual result a * b + c */
+
+ /* Multiply first; this is easy. */
+ /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
+ * because we want the true exponent, not the "one-less-than"
+ * flavour that roundAndPackFloat64() takes.
+ */
+ pExp = aExp + bExp - 0x3fe;
+ aSig = (aSig | LIT64(0x0010000000000000))<<10;
+ bSig = (bSig | LIT64(0x0010000000000000))<<11;
+ mul64To128(aSig, bSig, &pSig0, &pSig1);
+ if ((int64_t)(pSig0 << 1) >= 0) {
+ shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
+ pExp--;
+ }
+
+ zSign = pSign ^ signflip;
+
+ /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
+ * bit in position 126.
+ */
+ if (cExp == 0) {
+ if (!cSig) {
+ /* Throw out the special case of c being an exact zero now */
+ shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
+ return roundAndPackFloat64(zSign, pExp - 1,
+ pSig1 STATUS_VAR);
+ }
+ normalizeFloat64Subnormal(cSig, &cExp, &cSig);
+ }
+
+ /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
+ * significand of the addend, with the explicit bit in position 126.
+ */
+ cSig0 = cSig << (126 - 64 - 52);
+ cSig1 = 0;
+ cSig0 |= LIT64(0x4000000000000000);
+ expDiff = pExp - cExp;
+
+ if (pSign == cSign) {
+ /* Addition */
+ if (expDiff > 0) {
+ /* scale c to match p */
+ shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
+ zExp = pExp;
+ } else if (expDiff < 0) {
+ /* scale p to match c */
+ shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
+ zExp = cExp;
+ } else {
+ /* no scaling needed */
+ zExp = cExp;
+ }
+ /* Add significands and make sure explicit bit ends up in posn 126 */
+ add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+ if ((int64_t)zSig0 < 0) {
+ shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
+ } else {
+ zExp--;
+ }
+ shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
+ return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
+ } else {
+ /* Subtraction */
+ if (expDiff > 0) {
+ shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
+ sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+ zExp = pExp;
+ } else if (expDiff < 0) {
+ shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
+ sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
+ zExp = cExp;
+ zSign ^= 1;
+ } else {
+ zExp = pExp;
+ if (lt128(cSig0, cSig1, pSig0, pSig1)) {
+ sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+ } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
+ sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
+ zSign ^= 1;
+ } else {
+ /* Exact zero */
+ zSign = signflip;
+ if (STATUS(float_rounding_mode) == float_round_down) {
+ zSign ^= 1;
+ }
+ return packFloat64(zSign, 0, 0);
+ }
+ }
+ --zExp;
+ /* Do the equivalent of normalizeRoundAndPackFloat64() but
+ * starting with the significand in a pair of uint64_t.
+ */
+ if (zSig0) {
+ shiftcount = countLeadingZeros64(zSig0) - 1;
+ shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
+ if (zSig1) {
+ zSig0 |= 1;
+ }
+ zExp -= shiftcount;
+ } else {
+ shiftcount = countLeadingZeros64(zSig1) - 1;
+ zSig0 = zSig1 << shiftcount;
+ zExp -= (shiftcount + 64);
+ }
+ return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
+ }
+}
+
/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
float64 float64_sqrt( float64 a STATUS_PARAM )
{
flag aSign;
- int16 aExp, zExp;
+ int_fast16_t aExp, zExp;
uint64_t aSig, zSig, doubleZSig;
uint64_t rem0, rem1, term0, term1;
a = float64_squash_input_denormal(a STATUS_VAR);
float64 float64_log2( float64 a STATUS_PARAM )
{
flag aSign, zSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig, aSig0, aSig1, zSig, i;
a = float64_squash_input_denormal(a STATUS_VAR);
return 0;
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format. The
flag aSign;
int32 aExp, shiftCount;
uint64_t aSig, savedASig;
- int32 z;
+ int32_t z;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
}
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format. The
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
{
flag aSign;
- int16 aExp;
+ int_fast16_t aExp;
uint64_t aSig, zSig0, zSig1;
aSig = extractFloatx80Frac( a );
}
-#endif
-
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended quadruple-precision floating-point
return 0;
}
-#endif
-
-#ifdef FLOAT128
-
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format. The conversion
flag aSign;
int32 aExp, shiftCount;
uint64_t aSig0, aSig1, savedASig;
- int32 z;
+ int32_t z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
}
-#ifdef FLOATX80
-
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format. The
}
-#endif
-
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value. The
}
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
if ( aExp == 0 ) {
- if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
+ if (STATUS(flush_to_zero)) {
+ if (zSig0 | zSig1) {
+ float_raise(float_flag_output_denormal STATUS_VAR);
+ }
+ return packFloat128(zSign, 0, 0, 0);
+ }
return packFloat128( zSign, 0, zSig0, zSig1 );
}
zSig2 = 0;
return 0;
}
-#endif
-
/* misc functions */
-float32 uint32_to_float32( unsigned int a STATUS_PARAM )
+float32 uint32_to_float32( uint32 a STATUS_PARAM )
{
return int64_to_float32(a STATUS_VAR);
}
-float64 uint32_to_float64( unsigned int a STATUS_PARAM )
+float64 uint32_to_float64( uint32 a STATUS_PARAM )
{
return int64_to_float64(a STATUS_VAR);
}
-unsigned int float32_to_uint32( float32 a STATUS_PARAM )
+uint32 float32_to_uint32( float32 a STATUS_PARAM )
{
int64_t v;
- unsigned int res;
+ uint32 res;
v = float32_to_int64(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
+uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
{
int64_t v;
- unsigned int res;
+ uint32 res;
v = float32_to_int64_round_to_zero(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
+uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
{
int64_t v;
- unsigned int res;
+ uint_fast16_t res;
v = float32_to_int64_round_to_zero(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float64_to_uint32( float64 a STATUS_PARAM )
+uint32 float64_to_uint32( float64 a STATUS_PARAM )
{
int64_t v;
- unsigned int res;
+ uint32 res;
v = float64_to_int64(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
+uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
{
int64_t v;
- unsigned int res;
+ uint32 res;
v = float64_to_int64_round_to_zero(a STATUS_VAR);
if (v < 0) {
return res;
}
-unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
+uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
{
int64_t v;
- unsigned int res;
+ uint_fast16_t res;
v = float64_to_int64_round_to_zero(a STATUS_VAR);
if (v < 0) {
return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
}
-#ifdef FLOATX80
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
{
flag aSign;
return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
aSign, aExp, aSig, 0 STATUS_VAR );
}
-#endif
-#ifdef FLOAT128
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
{
flag aSign;
STATUS_VAR );
}
-#endif