4 * Derived from SoftFloat.
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
43 #include "fpu/softfloat.h"
45 /*----------------------------------------------------------------------------
46 | Primitive arithmetic functions, including multi-word arithmetic, and
47 | division and square root approximations. (Can be specialized to target if
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-macros.h"
52 /*----------------------------------------------------------------------------
53 | Functions and definitions to determine: (1) whether tininess for underflow
54 | is detected before or after rounding by default, (2) what (if anything)
55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57 | are propagated from function inputs to output. These details are target-
59 *----------------------------------------------------------------------------*/
60 #include "softfloat-specialize.h"
62 void set_float_rounding_mode(int val STATUS_PARAM)
64 STATUS(float_rounding_mode) = val;
67 void set_float_exception_flags(int val STATUS_PARAM)
69 STATUS(float_exception_flags) = val;
72 void set_floatx80_rounding_precision(int val STATUS_PARAM)
74 STATUS(floatx80_rounding_precision) = val;
77 /*----------------------------------------------------------------------------
78 | Returns the fraction bits of the half-precision floating-point value `a'.
79 *----------------------------------------------------------------------------*/
81 INLINE uint32_t extractFloat16Frac(float16 a)
83 return float16_val(a) & 0x3ff;
86 /*----------------------------------------------------------------------------
87 | Returns the exponent bits of the half-precision floating-point value `a'.
88 *----------------------------------------------------------------------------*/
90 INLINE int_fast16_t extractFloat16Exp(float16 a)
92 return (float16_val(a) >> 10) & 0x1f;
95 /*----------------------------------------------------------------------------
96 | Returns the sign bit of the single-precision floating-point value `a'.
97 *----------------------------------------------------------------------------*/
99 INLINE flag extractFloat16Sign(float16 a)
101 return float16_val(a)>>15;
104 /*----------------------------------------------------------------------------
105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
107 | input. If `zSign' is 1, the input is negated before being converted to an
108 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109 | is simply rounded to an integer, with the inexact exception raised if the
110 | input cannot be represented exactly as an integer. However, if the fixed-
111 | point input is too large, the invalid exception is raised and the largest
112 | positive or negative integer is returned.
113 *----------------------------------------------------------------------------*/
115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
118 flag roundNearestEven;
119 int8 roundIncrement, roundBits;
122 roundingMode = STATUS(float_rounding_mode);
123 roundNearestEven = ( roundingMode == float_round_nearest_even );
124 roundIncrement = 0x40;
125 if ( ! roundNearestEven ) {
126 if ( roundingMode == float_round_to_zero ) {
130 roundIncrement = 0x7F;
132 if ( roundingMode == float_round_up ) roundIncrement = 0;
135 if ( roundingMode == float_round_down ) roundIncrement = 0;
139 roundBits = absZ & 0x7F;
140 absZ = ( absZ + roundIncrement )>>7;
141 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143 if ( zSign ) z = - z;
144 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145 float_raise( float_flag_invalid STATUS_VAR);
146 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
148 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
153 /*----------------------------------------------------------------------------
154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
156 | and returns the properly rounded 64-bit integer corresponding to the input.
157 | If `zSign' is 1, the input is negated before being converted to an integer.
158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
159 | the inexact exception raised if the input cannot be represented exactly as
160 | an integer. However, if the fixed-point input is too large, the invalid
161 | exception is raised and the largest positive or negative integer is
163 *----------------------------------------------------------------------------*/
165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
168 flag roundNearestEven, increment;
171 roundingMode = STATUS(float_rounding_mode);
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
173 increment = ( (int64_t) absZ1 < 0 );
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
183 increment = ( roundingMode == float_round_up ) && absZ1;
189 if ( absZ0 == 0 ) goto overflow;
190 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
196 float_raise( float_flag_invalid STATUS_VAR);
198 zSign ? (int64_t) LIT64( 0x8000000000000000 )
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
201 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
206 /*----------------------------------------------------------------------------
207 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
208 | `absZ1', with binary point between bits 63 and 64 (between the input words),
209 | and returns the properly rounded 64-bit unsigned integer corresponding to the
210 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
211 | with the inexact exception raised if the input cannot be represented exactly
212 | as an integer. However, if the fixed-point input is too large, the invalid
213 | exception is raised and the largest unsigned integer is returned.
214 *----------------------------------------------------------------------------*/
216 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
217 uint64_t absZ1 STATUS_PARAM)
220 flag roundNearestEven, increment;
222 roundingMode = STATUS(float_rounding_mode);
223 roundNearestEven = (roundingMode == float_round_nearest_even);
224 increment = ((int64_t)absZ1 < 0);
225 if (!roundNearestEven) {
226 if (roundingMode == float_round_to_zero) {
230 increment = (roundingMode == float_round_down) && absZ1;
232 increment = (roundingMode == float_round_up) && absZ1;
239 float_raise(float_flag_invalid STATUS_VAR);
240 return LIT64(0xFFFFFFFFFFFFFFFF);
242 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
245 if (zSign && absZ0) {
246 float_raise(float_flag_invalid STATUS_VAR);
251 STATUS(float_exception_flags) |= float_flag_inexact;
256 /*----------------------------------------------------------------------------
257 | Returns the fraction bits of the single-precision floating-point value `a'.
258 *----------------------------------------------------------------------------*/
260 INLINE uint32_t extractFloat32Frac( float32 a )
263 return float32_val(a) & 0x007FFFFF;
267 /*----------------------------------------------------------------------------
268 | Returns the exponent bits of the single-precision floating-point value `a'.
269 *----------------------------------------------------------------------------*/
271 INLINE int_fast16_t extractFloat32Exp(float32 a)
274 return ( float32_val(a)>>23 ) & 0xFF;
278 /*----------------------------------------------------------------------------
279 | Returns the sign bit of the single-precision floating-point value `a'.
280 *----------------------------------------------------------------------------*/
282 INLINE flag extractFloat32Sign( float32 a )
285 return float32_val(a)>>31;
289 /*----------------------------------------------------------------------------
290 | If `a' is denormal and we are in flush-to-zero mode then set the
291 | input-denormal exception and return zero. Otherwise just return the value.
292 *----------------------------------------------------------------------------*/
293 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
295 if (STATUS(flush_inputs_to_zero)) {
296 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
297 float_raise(float_flag_input_denormal STATUS_VAR);
298 return make_float32(float32_val(a) & 0x80000000);
304 /*----------------------------------------------------------------------------
305 | Normalizes the subnormal single-precision floating-point value represented
306 | by the denormalized significand `aSig'. The normalized exponent and
307 | significand are stored at the locations pointed to by `zExpPtr' and
308 | `zSigPtr', respectively.
309 *----------------------------------------------------------------------------*/
312 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
316 shiftCount = countLeadingZeros32( aSig ) - 8;
317 *zSigPtr = aSig<<shiftCount;
318 *zExpPtr = 1 - shiftCount;
322 /*----------------------------------------------------------------------------
323 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
324 | single-precision floating-point value, returning the result. After being
325 | shifted into the proper positions, the three fields are simply added
326 | together to form the result. This means that any integer portion of `zSig'
327 | will be added into the exponent. Since a properly normalized significand
328 | will have an integer portion equal to 1, the `zExp' input should be 1 less
329 | than the desired result exponent whenever `zSig' is a complete, normalized
331 *----------------------------------------------------------------------------*/
333 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
337 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
341 /*----------------------------------------------------------------------------
342 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
343 | and significand `zSig', and returns the proper single-precision floating-
344 | point value corresponding to the abstract input. Ordinarily, the abstract
345 | value is simply rounded and packed into the single-precision format, with
346 | the inexact exception raised if the abstract input cannot be represented
347 | exactly. However, if the abstract value is too large, the overflow and
348 | inexact exceptions are raised and an infinity or maximal finite value is
349 | returned. If the abstract value is too small, the input value is rounded to
350 | a subnormal number, and the underflow and inexact exceptions are raised if
351 | the abstract input cannot be represented exactly as a subnormal single-
352 | precision floating-point number.
353 | The input significand `zSig' has its binary point between bits 30
354 | and 29, which is 7 bits to the left of the usual location. This shifted
355 | significand must be normalized or smaller. If `zSig' is not normalized,
356 | `zExp' must be 0; in that case, the result returned is a subnormal number,
357 | and it must not require rounding. In the usual case that `zSig' is
358 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
359 | The handling of underflow and overflow follows the IEC/IEEE Standard for
360 | Binary Floating-Point Arithmetic.
361 *----------------------------------------------------------------------------*/
363 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
366 flag roundNearestEven;
367 int8 roundIncrement, roundBits;
370 roundingMode = STATUS(float_rounding_mode);
371 roundNearestEven = ( roundingMode == float_round_nearest_even );
372 roundIncrement = 0x40;
373 if ( ! roundNearestEven ) {
374 if ( roundingMode == float_round_to_zero ) {
378 roundIncrement = 0x7F;
380 if ( roundingMode == float_round_up ) roundIncrement = 0;
383 if ( roundingMode == float_round_down ) roundIncrement = 0;
387 roundBits = zSig & 0x7F;
388 if ( 0xFD <= (uint16_t) zExp ) {
390 || ( ( zExp == 0xFD )
391 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
393 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
394 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
397 if (STATUS(flush_to_zero)) {
398 float_raise(float_flag_output_denormal STATUS_VAR);
399 return packFloat32(zSign, 0, 0);
402 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
404 || ( zSig + roundIncrement < 0x80000000 );
405 shift32RightJamming( zSig, - zExp, &zSig );
407 roundBits = zSig & 0x7F;
408 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
411 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
412 zSig = ( zSig + roundIncrement )>>7;
413 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
414 if ( zSig == 0 ) zExp = 0;
415 return packFloat32( zSign, zExp, zSig );
419 /*----------------------------------------------------------------------------
420 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
421 | and significand `zSig', and returns the proper single-precision floating-
422 | point value corresponding to the abstract input. This routine is just like
423 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
424 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
425 | floating-point exponent.
426 *----------------------------------------------------------------------------*/
429 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
433 shiftCount = countLeadingZeros32( zSig ) - 1;
434 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
438 /*----------------------------------------------------------------------------
439 | Returns the fraction bits of the double-precision floating-point value `a'.
440 *----------------------------------------------------------------------------*/
442 INLINE uint64_t extractFloat64Frac( float64 a )
445 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
449 /*----------------------------------------------------------------------------
450 | Returns the exponent bits of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 INLINE int_fast16_t extractFloat64Exp(float64 a)
456 return ( float64_val(a)>>52 ) & 0x7FF;
460 /*----------------------------------------------------------------------------
461 | Returns the sign bit of the double-precision floating-point value `a'.
462 *----------------------------------------------------------------------------*/
464 INLINE flag extractFloat64Sign( float64 a )
467 return float64_val(a)>>63;
471 /*----------------------------------------------------------------------------
472 | If `a' is denormal and we are in flush-to-zero mode then set the
473 | input-denormal exception and return zero. Otherwise just return the value.
474 *----------------------------------------------------------------------------*/
475 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
477 if (STATUS(flush_inputs_to_zero)) {
478 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
479 float_raise(float_flag_input_denormal STATUS_VAR);
480 return make_float64(float64_val(a) & (1ULL << 63));
486 /*----------------------------------------------------------------------------
487 | Normalizes the subnormal double-precision floating-point value represented
488 | by the denormalized significand `aSig'. The normalized exponent and
489 | significand are stored at the locations pointed to by `zExpPtr' and
490 | `zSigPtr', respectively.
491 *----------------------------------------------------------------------------*/
494 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
498 shiftCount = countLeadingZeros64( aSig ) - 11;
499 *zSigPtr = aSig<<shiftCount;
500 *zExpPtr = 1 - shiftCount;
504 /*----------------------------------------------------------------------------
505 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
506 | double-precision floating-point value, returning the result. After being
507 | shifted into the proper positions, the three fields are simply added
508 | together to form the result. This means that any integer portion of `zSig'
509 | will be added into the exponent. Since a properly normalized significand
510 | will have an integer portion equal to 1, the `zExp' input should be 1 less
511 | than the desired result exponent whenever `zSig' is a complete, normalized
513 *----------------------------------------------------------------------------*/
515 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
519 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
523 /*----------------------------------------------------------------------------
524 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
525 | and significand `zSig', and returns the proper double-precision floating-
526 | point value corresponding to the abstract input. Ordinarily, the abstract
527 | value is simply rounded and packed into the double-precision format, with
528 | the inexact exception raised if the abstract input cannot be represented
529 | exactly. However, if the abstract value is too large, the overflow and
530 | inexact exceptions are raised and an infinity or maximal finite value is
531 | returned. If the abstract value is too small, the input value is rounded
532 | to a subnormal number, and the underflow and inexact exceptions are raised
533 | if the abstract input cannot be represented exactly as a subnormal double-
534 | precision floating-point number.
535 | The input significand `zSig' has its binary point between bits 62
536 | and 61, which is 10 bits to the left of the usual location. This shifted
537 | significand must be normalized or smaller. If `zSig' is not normalized,
538 | `zExp' must be 0; in that case, the result returned is a subnormal number,
539 | and it must not require rounding. In the usual case that `zSig' is
540 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
541 | The handling of underflow and overflow follows the IEC/IEEE Standard for
542 | Binary Floating-Point Arithmetic.
543 *----------------------------------------------------------------------------*/
545 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
548 flag roundNearestEven;
549 int_fast16_t roundIncrement, roundBits;
552 roundingMode = STATUS(float_rounding_mode);
553 roundNearestEven = ( roundingMode == float_round_nearest_even );
554 roundIncrement = 0x200;
555 if ( ! roundNearestEven ) {
556 if ( roundingMode == float_round_to_zero ) {
560 roundIncrement = 0x3FF;
562 if ( roundingMode == float_round_up ) roundIncrement = 0;
565 if ( roundingMode == float_round_down ) roundIncrement = 0;
569 roundBits = zSig & 0x3FF;
570 if ( 0x7FD <= (uint16_t) zExp ) {
571 if ( ( 0x7FD < zExp )
572 || ( ( zExp == 0x7FD )
573 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
575 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
576 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
579 if (STATUS(flush_to_zero)) {
580 float_raise(float_flag_output_denormal STATUS_VAR);
581 return packFloat64(zSign, 0, 0);
584 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
586 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
587 shift64RightJamming( zSig, - zExp, &zSig );
589 roundBits = zSig & 0x3FF;
590 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
593 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
594 zSig = ( zSig + roundIncrement )>>10;
595 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
596 if ( zSig == 0 ) zExp = 0;
597 return packFloat64( zSign, zExp, zSig );
601 /*----------------------------------------------------------------------------
602 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
603 | and significand `zSig', and returns the proper double-precision floating-
604 | point value corresponding to the abstract input. This routine is just like
605 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
606 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
607 | floating-point exponent.
608 *----------------------------------------------------------------------------*/
611 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
615 shiftCount = countLeadingZeros64( zSig ) - 1;
616 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620 /*----------------------------------------------------------------------------
621 | Returns the fraction bits of the extended double-precision floating-point
623 *----------------------------------------------------------------------------*/
625 INLINE uint64_t extractFloatx80Frac( floatx80 a )
632 /*----------------------------------------------------------------------------
633 | Returns the exponent bits of the extended double-precision floating-point
635 *----------------------------------------------------------------------------*/
637 INLINE int32 extractFloatx80Exp( floatx80 a )
640 return a.high & 0x7FFF;
644 /*----------------------------------------------------------------------------
645 | Returns the sign bit of the extended double-precision floating-point value
647 *----------------------------------------------------------------------------*/
649 INLINE flag extractFloatx80Sign( floatx80 a )
656 /*----------------------------------------------------------------------------
657 | Normalizes the subnormal extended double-precision floating-point value
658 | represented by the denormalized significand `aSig'. The normalized exponent
659 | and significand are stored at the locations pointed to by `zExpPtr' and
660 | `zSigPtr', respectively.
661 *----------------------------------------------------------------------------*/
664 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
668 shiftCount = countLeadingZeros64( aSig );
669 *zSigPtr = aSig<<shiftCount;
670 *zExpPtr = 1 - shiftCount;
674 /*----------------------------------------------------------------------------
675 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
676 | extended double-precision floating-point value, returning the result.
677 *----------------------------------------------------------------------------*/
679 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
684 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
689 /*----------------------------------------------------------------------------
690 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
691 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
692 | and returns the proper extended double-precision floating-point value
693 | corresponding to the abstract input. Ordinarily, the abstract value is
694 | rounded and packed into the extended double-precision format, with the
695 | inexact exception raised if the abstract input cannot be represented
696 | exactly. However, if the abstract value is too large, the overflow and
697 | inexact exceptions are raised and an infinity or maximal finite value is
698 | returned. If the abstract value is too small, the input value is rounded to
699 | a subnormal number, and the underflow and inexact exceptions are raised if
700 | the abstract input cannot be represented exactly as a subnormal extended
701 | double-precision floating-point number.
702 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
703 | number of bits as single or double precision, respectively. Otherwise, the
704 | result is rounded to the full precision of the extended double-precision
706 | The input significand must be normalized or smaller. If the input
707 | significand is not normalized, `zExp' must be 0; in that case, the result
708 | returned is a subnormal number, and it must not require rounding. The
709 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
710 | Floating-Point Arithmetic.
711 *----------------------------------------------------------------------------*/
714 roundAndPackFloatx80(
715 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
719 flag roundNearestEven, increment, isTiny;
720 int64 roundIncrement, roundMask, roundBits;
722 roundingMode = STATUS(float_rounding_mode);
723 roundNearestEven = ( roundingMode == float_round_nearest_even );
724 if ( roundingPrecision == 80 ) goto precision80;
725 if ( roundingPrecision == 64 ) {
726 roundIncrement = LIT64( 0x0000000000000400 );
727 roundMask = LIT64( 0x00000000000007FF );
729 else if ( roundingPrecision == 32 ) {
730 roundIncrement = LIT64( 0x0000008000000000 );
731 roundMask = LIT64( 0x000000FFFFFFFFFF );
736 zSig0 |= ( zSig1 != 0 );
737 if ( ! roundNearestEven ) {
738 if ( roundingMode == float_round_to_zero ) {
742 roundIncrement = roundMask;
744 if ( roundingMode == float_round_up ) roundIncrement = 0;
747 if ( roundingMode == float_round_down ) roundIncrement = 0;
751 roundBits = zSig0 & roundMask;
752 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
753 if ( ( 0x7FFE < zExp )
754 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
759 if (STATUS(flush_to_zero)) {
760 float_raise(float_flag_output_denormal STATUS_VAR);
761 return packFloatx80(zSign, 0, 0);
764 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
766 || ( zSig0 <= zSig0 + roundIncrement );
767 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
769 roundBits = zSig0 & roundMask;
770 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
771 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
772 zSig0 += roundIncrement;
773 if ( (int64_t) zSig0 < 0 ) zExp = 1;
774 roundIncrement = roundMask + 1;
775 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
776 roundMask |= roundIncrement;
778 zSig0 &= ~ roundMask;
779 return packFloatx80( zSign, zExp, zSig0 );
782 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
783 zSig0 += roundIncrement;
784 if ( zSig0 < roundIncrement ) {
786 zSig0 = LIT64( 0x8000000000000000 );
788 roundIncrement = roundMask + 1;
789 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
790 roundMask |= roundIncrement;
792 zSig0 &= ~ roundMask;
793 if ( zSig0 == 0 ) zExp = 0;
794 return packFloatx80( zSign, zExp, zSig0 );
796 increment = ( (int64_t) zSig1 < 0 );
797 if ( ! roundNearestEven ) {
798 if ( roundingMode == float_round_to_zero ) {
803 increment = ( roundingMode == float_round_down ) && zSig1;
806 increment = ( roundingMode == float_round_up ) && zSig1;
810 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
811 if ( ( 0x7FFE < zExp )
812 || ( ( zExp == 0x7FFE )
813 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
819 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
820 if ( ( roundingMode == float_round_to_zero )
821 || ( zSign && ( roundingMode == float_round_up ) )
822 || ( ! zSign && ( roundingMode == float_round_down ) )
824 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
826 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
830 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
833 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
834 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
836 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
837 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
838 if ( roundNearestEven ) {
839 increment = ( (int64_t) zSig1 < 0 );
843 increment = ( roundingMode == float_round_down ) && zSig1;
846 increment = ( roundingMode == float_round_up ) && zSig1;
852 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
853 if ( (int64_t) zSig0 < 0 ) zExp = 1;
855 return packFloatx80( zSign, zExp, zSig0 );
858 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
863 zSig0 = LIT64( 0x8000000000000000 );
866 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
870 if ( zSig0 == 0 ) zExp = 0;
872 return packFloatx80( zSign, zExp, zSig0 );
876 /*----------------------------------------------------------------------------
877 | Takes an abstract floating-point value having sign `zSign', exponent
878 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
879 | and returns the proper extended double-precision floating-point value
880 | corresponding to the abstract input. This routine is just like
881 | `roundAndPackFloatx80' except that the input significand does not have to be
883 *----------------------------------------------------------------------------*/
886 normalizeRoundAndPackFloatx80(
887 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
897 shiftCount = countLeadingZeros64( zSig0 );
898 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
901 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
905 /*----------------------------------------------------------------------------
906 | Returns the least-significant 64 fraction bits of the quadruple-precision
907 | floating-point value `a'.
908 *----------------------------------------------------------------------------*/
910 INLINE uint64_t extractFloat128Frac1( float128 a )
917 /*----------------------------------------------------------------------------
918 | Returns the most-significant 48 fraction bits of the quadruple-precision
919 | floating-point value `a'.
920 *----------------------------------------------------------------------------*/
922 INLINE uint64_t extractFloat128Frac0( float128 a )
925 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
929 /*----------------------------------------------------------------------------
930 | Returns the exponent bits of the quadruple-precision floating-point value
932 *----------------------------------------------------------------------------*/
934 INLINE int32 extractFloat128Exp( float128 a )
937 return ( a.high>>48 ) & 0x7FFF;
941 /*----------------------------------------------------------------------------
942 | Returns the sign bit of the quadruple-precision floating-point value `a'.
943 *----------------------------------------------------------------------------*/
945 INLINE flag extractFloat128Sign( float128 a )
952 /*----------------------------------------------------------------------------
953 | Normalizes the subnormal quadruple-precision floating-point value
954 | represented by the denormalized significand formed by the concatenation of
955 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
956 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
957 | significand are stored at the location pointed to by `zSig0Ptr', and the
958 | least significant 64 bits of the normalized significand are stored at the
959 | location pointed to by `zSig1Ptr'.
960 *----------------------------------------------------------------------------*/
963 normalizeFloat128Subnormal(
974 shiftCount = countLeadingZeros64( aSig1 ) - 15;
975 if ( shiftCount < 0 ) {
976 *zSig0Ptr = aSig1>>( - shiftCount );
977 *zSig1Ptr = aSig1<<( shiftCount & 63 );
980 *zSig0Ptr = aSig1<<shiftCount;
983 *zExpPtr = - shiftCount - 63;
986 shiftCount = countLeadingZeros64( aSig0 ) - 15;
987 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
988 *zExpPtr = 1 - shiftCount;
993 /*----------------------------------------------------------------------------
994 | Packs the sign `zSign', the exponent `zExp', and the significand formed
995 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
996 | floating-point value, returning the result. After being shifted into the
997 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
998 | added together to form the most significant 32 bits of the result. This
999 | means that any integer portion of `zSig0' will be added into the exponent.
1000 | Since a properly normalized significand will have an integer portion equal
1001 | to 1, the `zExp' input should be 1 less than the desired result exponent
1002 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1004 *----------------------------------------------------------------------------*/
1007 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1012 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1017 /*----------------------------------------------------------------------------
1018 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1019 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1020 | and `zSig2', and returns the proper quadruple-precision floating-point value
1021 | corresponding to the abstract input. Ordinarily, the abstract value is
1022 | simply rounded and packed into the quadruple-precision format, with the
1023 | inexact exception raised if the abstract input cannot be represented
1024 | exactly. However, if the abstract value is too large, the overflow and
1025 | inexact exceptions are raised and an infinity or maximal finite value is
1026 | returned. If the abstract value is too small, the input value is rounded to
1027 | a subnormal number, and the underflow and inexact exceptions are raised if
1028 | the abstract input cannot be represented exactly as a subnormal quadruple-
1029 | precision floating-point number.
1030 | The input significand must be normalized or smaller. If the input
1031 | significand is not normalized, `zExp' must be 0; in that case, the result
1032 | returned is a subnormal number, and it must not require rounding. In the
1033 | usual case that the input significand is normalized, `zExp' must be 1 less
1034 | than the ``true'' floating-point exponent. The handling of underflow and
1035 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1036 *----------------------------------------------------------------------------*/
1039 roundAndPackFloat128(
1040 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1043 flag roundNearestEven, increment, isTiny;
1045 roundingMode = STATUS(float_rounding_mode);
1046 roundNearestEven = ( roundingMode == float_round_nearest_even );
1047 increment = ( (int64_t) zSig2 < 0 );
1048 if ( ! roundNearestEven ) {
1049 if ( roundingMode == float_round_to_zero ) {
1054 increment = ( roundingMode == float_round_down ) && zSig2;
1057 increment = ( roundingMode == float_round_up ) && zSig2;
1061 if ( 0x7FFD <= (uint32_t) zExp ) {
1062 if ( ( 0x7FFD < zExp )
1063 || ( ( zExp == 0x7FFD )
1065 LIT64( 0x0001FFFFFFFFFFFF ),
1066 LIT64( 0xFFFFFFFFFFFFFFFF ),
1073 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1074 if ( ( roundingMode == float_round_to_zero )
1075 || ( zSign && ( roundingMode == float_round_up ) )
1076 || ( ! zSign && ( roundingMode == float_round_down ) )
1082 LIT64( 0x0000FFFFFFFFFFFF ),
1083 LIT64( 0xFFFFFFFFFFFFFFFF )
1086 return packFloat128( zSign, 0x7FFF, 0, 0 );
1089 if (STATUS(flush_to_zero)) {
1090 float_raise(float_flag_output_denormal STATUS_VAR);
1091 return packFloat128(zSign, 0, 0, 0);
1094 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1100 LIT64( 0x0001FFFFFFFFFFFF ),
1101 LIT64( 0xFFFFFFFFFFFFFFFF )
1103 shift128ExtraRightJamming(
1104 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1106 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1107 if ( roundNearestEven ) {
1108 increment = ( (int64_t) zSig2 < 0 );
1112 increment = ( roundingMode == float_round_down ) && zSig2;
1115 increment = ( roundingMode == float_round_up ) && zSig2;
1120 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1122 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1123 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1126 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1128 return packFloat128( zSign, zExp, zSig0, zSig1 );
1132 /*----------------------------------------------------------------------------
1133 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1134 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1135 | returns the proper quadruple-precision floating-point value corresponding
1136 | to the abstract input. This routine is just like `roundAndPackFloat128'
1137 | except that the input significand has fewer bits and does not have to be
1138 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1140 *----------------------------------------------------------------------------*/
1143 normalizeRoundAndPackFloat128(
1144 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1154 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1155 if ( 0 <= shiftCount ) {
1157 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1160 shift128ExtraRightJamming(
1161 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1164 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1168 /*----------------------------------------------------------------------------
1169 | Returns the result of converting the 32-bit two's complement integer `a'
1170 | to the single-precision floating-point format. The conversion is performed
1171 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172 *----------------------------------------------------------------------------*/
1174 float32 int32_to_float32(int32_t a STATUS_PARAM)
1178 if ( a == 0 ) return float32_zero;
1179 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1181 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1185 /*----------------------------------------------------------------------------
1186 | Returns the result of converting the 32-bit two's complement integer `a'
1187 | to the double-precision floating-point format. The conversion is performed
1188 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189 *----------------------------------------------------------------------------*/
1191 float64 int32_to_float64(int32_t a STATUS_PARAM)
1198 if ( a == 0 ) return float64_zero;
1200 absA = zSign ? - a : a;
1201 shiftCount = countLeadingZeros32( absA ) + 21;
1203 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1207 /*----------------------------------------------------------------------------
1208 | Returns the result of converting the 32-bit two's complement integer `a'
1209 | to the extended double-precision floating-point format. The conversion
1210 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1212 *----------------------------------------------------------------------------*/
1214 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1221 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1223 absA = zSign ? - a : a;
1224 shiftCount = countLeadingZeros32( absA ) + 32;
1226 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1230 /*----------------------------------------------------------------------------
1231 | Returns the result of converting the 32-bit two's complement integer `a' to
1232 | the quadruple-precision floating-point format. The conversion is performed
1233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1234 *----------------------------------------------------------------------------*/
1236 float128 int32_to_float128(int32_t a STATUS_PARAM)
1243 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1245 absA = zSign ? - a : a;
1246 shiftCount = countLeadingZeros32( absA ) + 17;
1248 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1252 /*----------------------------------------------------------------------------
1253 | Returns the result of converting the 64-bit two's complement integer `a'
1254 | to the single-precision floating-point format. The conversion is performed
1255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256 *----------------------------------------------------------------------------*/
1258 float32 int64_to_float32(int64_t a STATUS_PARAM)
1264 if ( a == 0 ) return float32_zero;
1266 absA = zSign ? - a : a;
1267 shiftCount = countLeadingZeros64( absA ) - 40;
1268 if ( 0 <= shiftCount ) {
1269 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1273 if ( shiftCount < 0 ) {
1274 shift64RightJamming( absA, - shiftCount, &absA );
1277 absA <<= shiftCount;
1279 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1284 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1288 if ( a == 0 ) return float32_zero;
1289 shiftCount = countLeadingZeros64( a ) - 40;
1290 if ( 0 <= shiftCount ) {
1291 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1295 if ( shiftCount < 0 ) {
1296 shift64RightJamming( a, - shiftCount, &a );
1301 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1305 /*----------------------------------------------------------------------------
1306 | Returns the result of converting the 64-bit two's complement integer `a'
1307 | to the double-precision floating-point format. The conversion is performed
1308 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1309 *----------------------------------------------------------------------------*/
1311 float64 int64_to_float64(int64_t a STATUS_PARAM)
1315 if ( a == 0 ) return float64_zero;
1316 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1317 return packFloat64( 1, 0x43E, 0 );
1320 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1324 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1329 return float64_zero;
1331 if ((int64_t)a < 0) {
1332 shift64RightJamming(a, 1, &a);
1335 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1338 /*----------------------------------------------------------------------------
1339 | Returns the result of converting the 64-bit two's complement integer `a'
1340 | to the extended double-precision floating-point format. The conversion
1341 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 *----------------------------------------------------------------------------*/
1345 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1351 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1353 absA = zSign ? - a : a;
1354 shiftCount = countLeadingZeros64( absA );
1355 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1359 /*----------------------------------------------------------------------------
1360 | Returns the result of converting the 64-bit two's complement integer `a' to
1361 | the quadruple-precision floating-point format. The conversion is performed
1362 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1363 *----------------------------------------------------------------------------*/
1365 float128 int64_to_float128(int64_t a STATUS_PARAM)
1371 uint64_t zSig0, zSig1;
1373 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1375 absA = zSign ? - a : a;
1376 shiftCount = countLeadingZeros64( absA ) + 49;
1377 zExp = 0x406E - shiftCount;
1378 if ( 64 <= shiftCount ) {
1387 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1388 return packFloat128( zSign, zExp, zSig0, zSig1 );
1392 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1395 return float128_zero;
1397 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1400 /*----------------------------------------------------------------------------
1401 | Returns the result of converting the single-precision floating-point value
1402 | `a' to the 32-bit two's complement integer format. The conversion is
1403 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1404 | Arithmetic---which means in particular that the conversion is rounded
1405 | according to the current rounding mode. If `a' is a NaN, the largest
1406 | positive integer is returned. Otherwise, if the conversion overflows, the
1407 | largest integer with the same sign as `a' is returned.
1408 *----------------------------------------------------------------------------*/
1410 int32 float32_to_int32( float32 a STATUS_PARAM )
1413 int_fast16_t aExp, shiftCount;
1417 a = float32_squash_input_denormal(a STATUS_VAR);
1418 aSig = extractFloat32Frac( a );
1419 aExp = extractFloat32Exp( a );
1420 aSign = extractFloat32Sign( a );
1421 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1422 if ( aExp ) aSig |= 0x00800000;
1423 shiftCount = 0xAF - aExp;
1426 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1427 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1431 /*----------------------------------------------------------------------------
1432 | Returns the result of converting the single-precision floating-point value
1433 | `a' to the 32-bit two's complement integer format. The conversion is
1434 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1435 | Arithmetic, except that the conversion is always rounded toward zero.
1436 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1437 | the conversion overflows, the largest integer with the same sign as `a' is
1439 *----------------------------------------------------------------------------*/
1441 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1444 int_fast16_t aExp, shiftCount;
1447 a = float32_squash_input_denormal(a STATUS_VAR);
1449 aSig = extractFloat32Frac( a );
1450 aExp = extractFloat32Exp( a );
1451 aSign = extractFloat32Sign( a );
1452 shiftCount = aExp - 0x9E;
1453 if ( 0 <= shiftCount ) {
1454 if ( float32_val(a) != 0xCF000000 ) {
1455 float_raise( float_flag_invalid STATUS_VAR);
1456 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1458 return (int32_t) 0x80000000;
1460 else if ( aExp <= 0x7E ) {
1461 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1464 aSig = ( aSig | 0x00800000 )<<8;
1465 z = aSig>>( - shiftCount );
1466 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1467 STATUS(float_exception_flags) |= float_flag_inexact;
1469 if ( aSign ) z = - z;
1474 /*----------------------------------------------------------------------------
1475 | Returns the result of converting the single-precision floating-point value
1476 | `a' to the 16-bit two's complement integer format. The conversion is
1477 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1478 | Arithmetic, except that the conversion is always rounded toward zero.
1479 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1480 | the conversion overflows, the largest integer with the same sign as `a' is
1482 *----------------------------------------------------------------------------*/
1484 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1487 int_fast16_t aExp, shiftCount;
1491 aSig = extractFloat32Frac( a );
1492 aExp = extractFloat32Exp( a );
1493 aSign = extractFloat32Sign( a );
1494 shiftCount = aExp - 0x8E;
1495 if ( 0 <= shiftCount ) {
1496 if ( float32_val(a) != 0xC7000000 ) {
1497 float_raise( float_flag_invalid STATUS_VAR);
1498 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1502 return (int32_t) 0xffff8000;
1504 else if ( aExp <= 0x7E ) {
1505 if ( aExp | aSig ) {
1506 STATUS(float_exception_flags) |= float_flag_inexact;
1511 aSig = ( aSig | 0x00800000 )<<8;
1512 z = aSig>>( - shiftCount );
1513 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1514 STATUS(float_exception_flags) |= float_flag_inexact;
1523 /*----------------------------------------------------------------------------
1524 | Returns the result of converting the single-precision floating-point value
1525 | `a' to the 64-bit two's complement integer format. The conversion is
1526 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1527 | Arithmetic---which means in particular that the conversion is rounded
1528 | according to the current rounding mode. If `a' is a NaN, the largest
1529 | positive integer is returned. Otherwise, if the conversion overflows, the
1530 | largest integer with the same sign as `a' is returned.
1531 *----------------------------------------------------------------------------*/
1533 int64 float32_to_int64( float32 a STATUS_PARAM )
1536 int_fast16_t aExp, shiftCount;
1538 uint64_t aSig64, aSigExtra;
1539 a = float32_squash_input_denormal(a STATUS_VAR);
1541 aSig = extractFloat32Frac( a );
1542 aExp = extractFloat32Exp( a );
1543 aSign = extractFloat32Sign( a );
1544 shiftCount = 0xBE - aExp;
1545 if ( shiftCount < 0 ) {
1546 float_raise( float_flag_invalid STATUS_VAR);
1547 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1548 return LIT64( 0x7FFFFFFFFFFFFFFF );
1550 return (int64_t) LIT64( 0x8000000000000000 );
1552 if ( aExp ) aSig |= 0x00800000;
1555 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1556 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1560 /*----------------------------------------------------------------------------
1561 | Returns the result of converting the single-precision floating-point value
1562 | `a' to the 64-bit unsigned integer format. The conversion is
1563 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1564 | Arithmetic---which means in particular that the conversion is rounded
1565 | according to the current rounding mode. If `a' is a NaN, the largest
1566 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1567 | largest unsigned integer is returned. If the 'a' is negative, the result
1568 | is rounded and zero is returned; values that do not round to zero will
1569 | raise the inexact exception flag.
1570 *----------------------------------------------------------------------------*/
1572 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1575 int_fast16_t aExp, shiftCount;
1577 uint64_t aSig64, aSigExtra;
1578 a = float32_squash_input_denormal(a STATUS_VAR);
1580 aSig = extractFloat32Frac(a);
1581 aExp = extractFloat32Exp(a);
1582 aSign = extractFloat32Sign(a);
1583 if ((aSign) && (aExp > 126)) {
1584 float_raise(float_flag_invalid STATUS_VAR);
1585 if (float32_is_any_nan(a)) {
1586 return LIT64(0xFFFFFFFFFFFFFFFF);
1591 shiftCount = 0xBE - aExp;
1595 if (shiftCount < 0) {
1596 float_raise(float_flag_invalid STATUS_VAR);
1597 return LIT64(0xFFFFFFFFFFFFFFFF);
1602 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1603 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1606 /*----------------------------------------------------------------------------
1607 | Returns the result of converting the single-precision floating-point value
1608 | `a' to the 64-bit two's complement integer format. The conversion is
1609 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1610 | Arithmetic, except that the conversion is always rounded toward zero. If
1611 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1612 | conversion overflows, the largest integer with the same sign as `a' is
1614 *----------------------------------------------------------------------------*/
1616 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1619 int_fast16_t aExp, shiftCount;
1623 a = float32_squash_input_denormal(a STATUS_VAR);
1625 aSig = extractFloat32Frac( a );
1626 aExp = extractFloat32Exp( a );
1627 aSign = extractFloat32Sign( a );
1628 shiftCount = aExp - 0xBE;
1629 if ( 0 <= shiftCount ) {
1630 if ( float32_val(a) != 0xDF000000 ) {
1631 float_raise( float_flag_invalid STATUS_VAR);
1632 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1633 return LIT64( 0x7FFFFFFFFFFFFFFF );
1636 return (int64_t) LIT64( 0x8000000000000000 );
1638 else if ( aExp <= 0x7E ) {
1639 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1642 aSig64 = aSig | 0x00800000;
1644 z = aSig64>>( - shiftCount );
1645 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1646 STATUS(float_exception_flags) |= float_flag_inexact;
1648 if ( aSign ) z = - z;
1653 /*----------------------------------------------------------------------------
1654 | Returns the result of converting the single-precision floating-point value
1655 | `a' to the double-precision floating-point format. The conversion is
1656 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1658 *----------------------------------------------------------------------------*/
1660 float64 float32_to_float64( float32 a STATUS_PARAM )
1665 a = float32_squash_input_denormal(a STATUS_VAR);
1667 aSig = extractFloat32Frac( a );
1668 aExp = extractFloat32Exp( a );
1669 aSign = extractFloat32Sign( a );
1670 if ( aExp == 0xFF ) {
1671 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1672 return packFloat64( aSign, 0x7FF, 0 );
1675 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1676 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1679 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1683 /*----------------------------------------------------------------------------
1684 | Returns the result of converting the single-precision floating-point value
1685 | `a' to the extended double-precision floating-point format. The conversion
1686 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1688 *----------------------------------------------------------------------------*/
1690 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1696 a = float32_squash_input_denormal(a STATUS_VAR);
1697 aSig = extractFloat32Frac( a );
1698 aExp = extractFloat32Exp( a );
1699 aSign = extractFloat32Sign( a );
1700 if ( aExp == 0xFF ) {
1701 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1702 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1705 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1706 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1709 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1713 /*----------------------------------------------------------------------------
1714 | Returns the result of converting the single-precision floating-point value
1715 | `a' to the double-precision floating-point format. The conversion is
1716 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1718 *----------------------------------------------------------------------------*/
1720 float128 float32_to_float128( float32 a STATUS_PARAM )
1726 a = float32_squash_input_denormal(a STATUS_VAR);
1727 aSig = extractFloat32Frac( a );
1728 aExp = extractFloat32Exp( a );
1729 aSign = extractFloat32Sign( a );
1730 if ( aExp == 0xFF ) {
1731 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1732 return packFloat128( aSign, 0x7FFF, 0, 0 );
1735 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1736 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1739 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1743 /*----------------------------------------------------------------------------
1744 | Rounds the single-precision floating-point value `a' to an integer, and
1745 | returns the result as a single-precision floating-point value. The
1746 | operation is performed according to the IEC/IEEE Standard for Binary
1747 | Floating-Point Arithmetic.
1748 *----------------------------------------------------------------------------*/
1750 float32 float32_round_to_int( float32 a STATUS_PARAM)
1754 uint32_t lastBitMask, roundBitsMask;
1757 a = float32_squash_input_denormal(a STATUS_VAR);
1759 aExp = extractFloat32Exp( a );
1760 if ( 0x96 <= aExp ) {
1761 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1762 return propagateFloat32NaN( a, a STATUS_VAR );
1766 if ( aExp <= 0x7E ) {
1767 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1768 STATUS(float_exception_flags) |= float_flag_inexact;
1769 aSign = extractFloat32Sign( a );
1770 switch ( STATUS(float_rounding_mode) ) {
1771 case float_round_nearest_even:
1772 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1773 return packFloat32( aSign, 0x7F, 0 );
1776 case float_round_down:
1777 return make_float32(aSign ? 0xBF800000 : 0);
1778 case float_round_up:
1779 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1781 return packFloat32( aSign, 0, 0 );
1784 lastBitMask <<= 0x96 - aExp;
1785 roundBitsMask = lastBitMask - 1;
1787 roundingMode = STATUS(float_rounding_mode);
1788 if ( roundingMode == float_round_nearest_even ) {
1789 z += lastBitMask>>1;
1790 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1792 else if ( roundingMode != float_round_to_zero ) {
1793 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1797 z &= ~ roundBitsMask;
1798 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1799 return make_float32(z);
1803 /*----------------------------------------------------------------------------
1804 | Returns the result of adding the absolute values of the single-precision
1805 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1806 | before being returned. `zSign' is ignored if the result is a NaN.
1807 | The addition is performed according to the IEC/IEEE Standard for Binary
1808 | Floating-Point Arithmetic.
1809 *----------------------------------------------------------------------------*/
1811 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1813 int_fast16_t aExp, bExp, zExp;
1814 uint32_t aSig, bSig, zSig;
1815 int_fast16_t expDiff;
1817 aSig = extractFloat32Frac( a );
1818 aExp = extractFloat32Exp( a );
1819 bSig = extractFloat32Frac( b );
1820 bExp = extractFloat32Exp( b );
1821 expDiff = aExp - bExp;
1824 if ( 0 < expDiff ) {
1825 if ( aExp == 0xFF ) {
1826 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1835 shift32RightJamming( bSig, expDiff, &bSig );
1838 else if ( expDiff < 0 ) {
1839 if ( bExp == 0xFF ) {
1840 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1841 return packFloat32( zSign, 0xFF, 0 );
1849 shift32RightJamming( aSig, - expDiff, &aSig );
1853 if ( aExp == 0xFF ) {
1854 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1858 if (STATUS(flush_to_zero)) {
1860 float_raise(float_flag_output_denormal STATUS_VAR);
1862 return packFloat32(zSign, 0, 0);
1864 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1866 zSig = 0x40000000 + aSig + bSig;
1871 zSig = ( aSig + bSig )<<1;
1873 if ( (int32_t) zSig < 0 ) {
1878 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1882 /*----------------------------------------------------------------------------
1883 | Returns the result of subtracting the absolute values of the single-
1884 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1885 | difference is negated before being returned. `zSign' is ignored if the
1886 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1887 | Standard for Binary Floating-Point Arithmetic.
1888 *----------------------------------------------------------------------------*/
1890 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1892 int_fast16_t aExp, bExp, zExp;
1893 uint32_t aSig, bSig, zSig;
1894 int_fast16_t expDiff;
1896 aSig = extractFloat32Frac( a );
1897 aExp = extractFloat32Exp( a );
1898 bSig = extractFloat32Frac( b );
1899 bExp = extractFloat32Exp( b );
1900 expDiff = aExp - bExp;
1903 if ( 0 < expDiff ) goto aExpBigger;
1904 if ( expDiff < 0 ) goto bExpBigger;
1905 if ( aExp == 0xFF ) {
1906 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1907 float_raise( float_flag_invalid STATUS_VAR);
1908 return float32_default_nan;
1914 if ( bSig < aSig ) goto aBigger;
1915 if ( aSig < bSig ) goto bBigger;
1916 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1918 if ( bExp == 0xFF ) {
1919 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1920 return packFloat32( zSign ^ 1, 0xFF, 0 );
1928 shift32RightJamming( aSig, - expDiff, &aSig );
1934 goto normalizeRoundAndPack;
1936 if ( aExp == 0xFF ) {
1937 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1946 shift32RightJamming( bSig, expDiff, &bSig );
1951 normalizeRoundAndPack:
1953 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1957 /*----------------------------------------------------------------------------
1958 | Returns the result of adding the single-precision floating-point values `a'
1959 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1960 | Binary Floating-Point Arithmetic.
1961 *----------------------------------------------------------------------------*/
1963 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1966 a = float32_squash_input_denormal(a STATUS_VAR);
1967 b = float32_squash_input_denormal(b STATUS_VAR);
1969 aSign = extractFloat32Sign( a );
1970 bSign = extractFloat32Sign( b );
1971 if ( aSign == bSign ) {
1972 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1975 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1980 /*----------------------------------------------------------------------------
1981 | Returns the result of subtracting the single-precision floating-point values
1982 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1983 | for Binary Floating-Point Arithmetic.
1984 *----------------------------------------------------------------------------*/
1986 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1989 a = float32_squash_input_denormal(a STATUS_VAR);
1990 b = float32_squash_input_denormal(b STATUS_VAR);
1992 aSign = extractFloat32Sign( a );
1993 bSign = extractFloat32Sign( b );
1994 if ( aSign == bSign ) {
1995 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1998 return addFloat32Sigs( a, b, aSign STATUS_VAR );
2003 /*----------------------------------------------------------------------------
2004 | Returns the result of multiplying the single-precision floating-point values
2005 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2006 | for Binary Floating-Point Arithmetic.
2007 *----------------------------------------------------------------------------*/
2009 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2011 flag aSign, bSign, zSign;
2012 int_fast16_t aExp, bExp, zExp;
2013 uint32_t aSig, bSig;
2017 a = float32_squash_input_denormal(a STATUS_VAR);
2018 b = float32_squash_input_denormal(b STATUS_VAR);
2020 aSig = extractFloat32Frac( a );
2021 aExp = extractFloat32Exp( a );
2022 aSign = extractFloat32Sign( a );
2023 bSig = extractFloat32Frac( b );
2024 bExp = extractFloat32Exp( b );
2025 bSign = extractFloat32Sign( b );
2026 zSign = aSign ^ bSign;
2027 if ( aExp == 0xFF ) {
2028 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2029 return propagateFloat32NaN( a, b STATUS_VAR );
2031 if ( ( bExp | bSig ) == 0 ) {
2032 float_raise( float_flag_invalid STATUS_VAR);
2033 return float32_default_nan;
2035 return packFloat32( zSign, 0xFF, 0 );
2037 if ( bExp == 0xFF ) {
2038 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2039 if ( ( aExp | aSig ) == 0 ) {
2040 float_raise( float_flag_invalid STATUS_VAR);
2041 return float32_default_nan;
2043 return packFloat32( zSign, 0xFF, 0 );
2046 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2047 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2050 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2051 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2053 zExp = aExp + bExp - 0x7F;
2054 aSig = ( aSig | 0x00800000 )<<7;
2055 bSig = ( bSig | 0x00800000 )<<8;
2056 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2058 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2062 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2066 /*----------------------------------------------------------------------------
2067 | Returns the result of dividing the single-precision floating-point value `a'
2068 | by the corresponding value `b'. The operation is performed according to the
2069 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2070 *----------------------------------------------------------------------------*/
2072 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2074 flag aSign, bSign, zSign;
2075 int_fast16_t aExp, bExp, zExp;
2076 uint32_t aSig, bSig, zSig;
2077 a = float32_squash_input_denormal(a STATUS_VAR);
2078 b = float32_squash_input_denormal(b STATUS_VAR);
2080 aSig = extractFloat32Frac( a );
2081 aExp = extractFloat32Exp( a );
2082 aSign = extractFloat32Sign( a );
2083 bSig = extractFloat32Frac( b );
2084 bExp = extractFloat32Exp( b );
2085 bSign = extractFloat32Sign( b );
2086 zSign = aSign ^ bSign;
2087 if ( aExp == 0xFF ) {
2088 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2089 if ( bExp == 0xFF ) {
2090 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2091 float_raise( float_flag_invalid STATUS_VAR);
2092 return float32_default_nan;
2094 return packFloat32( zSign, 0xFF, 0 );
2096 if ( bExp == 0xFF ) {
2097 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2098 return packFloat32( zSign, 0, 0 );
2102 if ( ( aExp | aSig ) == 0 ) {
2103 float_raise( float_flag_invalid STATUS_VAR);
2104 return float32_default_nan;
2106 float_raise( float_flag_divbyzero STATUS_VAR);
2107 return packFloat32( zSign, 0xFF, 0 );
2109 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2112 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2113 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2115 zExp = aExp - bExp + 0x7D;
2116 aSig = ( aSig | 0x00800000 )<<7;
2117 bSig = ( bSig | 0x00800000 )<<8;
2118 if ( bSig <= ( aSig + aSig ) ) {
2122 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2123 if ( ( zSig & 0x3F ) == 0 ) {
2124 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2126 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2130 /*----------------------------------------------------------------------------
2131 | Returns the remainder of the single-precision floating-point value `a'
2132 | with respect to the corresponding value `b'. The operation is performed
2133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2134 *----------------------------------------------------------------------------*/
2136 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2139 int_fast16_t aExp, bExp, expDiff;
2140 uint32_t aSig, bSig;
2142 uint64_t aSig64, bSig64, q64;
2143 uint32_t alternateASig;
2145 a = float32_squash_input_denormal(a STATUS_VAR);
2146 b = float32_squash_input_denormal(b STATUS_VAR);
2148 aSig = extractFloat32Frac( a );
2149 aExp = extractFloat32Exp( a );
2150 aSign = extractFloat32Sign( a );
2151 bSig = extractFloat32Frac( b );
2152 bExp = extractFloat32Exp( b );
2153 if ( aExp == 0xFF ) {
2154 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2155 return propagateFloat32NaN( a, b STATUS_VAR );
2157 float_raise( float_flag_invalid STATUS_VAR);
2158 return float32_default_nan;
2160 if ( bExp == 0xFF ) {
2161 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2166 float_raise( float_flag_invalid STATUS_VAR);
2167 return float32_default_nan;
2169 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2172 if ( aSig == 0 ) return a;
2173 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2175 expDiff = aExp - bExp;
2178 if ( expDiff < 32 ) {
2181 if ( expDiff < 0 ) {
2182 if ( expDiff < -1 ) return a;
2185 q = ( bSig <= aSig );
2186 if ( q ) aSig -= bSig;
2187 if ( 0 < expDiff ) {
2188 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2191 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2199 if ( bSig <= aSig ) aSig -= bSig;
2200 aSig64 = ( (uint64_t) aSig )<<40;
2201 bSig64 = ( (uint64_t) bSig )<<40;
2203 while ( 0 < expDiff ) {
2204 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2205 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2206 aSig64 = - ( ( bSig * q64 )<<38 );
2210 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2211 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2212 q = q64>>( 64 - expDiff );
2214 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2217 alternateASig = aSig;
2220 } while ( 0 <= (int32_t) aSig );
2221 sigMean = aSig + alternateASig;
2222 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2223 aSig = alternateASig;
2225 zSign = ( (int32_t) aSig < 0 );
2226 if ( zSign ) aSig = - aSig;
2227 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2231 /*----------------------------------------------------------------------------
2232 | Returns the result of multiplying the single-precision floating-point values
2233 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2234 | multiplication. The operation is performed according to the IEC/IEEE
2235 | Standard for Binary Floating-Point Arithmetic 754-2008.
2236 | The flags argument allows the caller to select negation of the
2237 | addend, the intermediate product, or the final result. (The difference
2238 | between this and having the caller do a separate negation is that negating
2239 | externally will flip the sign bit on NaNs.)
2240 *----------------------------------------------------------------------------*/
2242 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2244 flag aSign, bSign, cSign, zSign;
2245 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2246 uint32_t aSig, bSig, cSig;
2247 flag pInf, pZero, pSign;
2248 uint64_t pSig64, cSig64, zSig64;
2251 flag signflip, infzero;
2253 a = float32_squash_input_denormal(a STATUS_VAR);
2254 b = float32_squash_input_denormal(b STATUS_VAR);
2255 c = float32_squash_input_denormal(c STATUS_VAR);
2256 aSig = extractFloat32Frac(a);
2257 aExp = extractFloat32Exp(a);
2258 aSign = extractFloat32Sign(a);
2259 bSig = extractFloat32Frac(b);
2260 bExp = extractFloat32Exp(b);
2261 bSign = extractFloat32Sign(b);
2262 cSig = extractFloat32Frac(c);
2263 cExp = extractFloat32Exp(c);
2264 cSign = extractFloat32Sign(c);
2266 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2267 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2269 /* It is implementation-defined whether the cases of (0,inf,qnan)
2270 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2271 * they return if they do), so we have to hand this information
2272 * off to the target-specific pick-a-NaN routine.
2274 if (((aExp == 0xff) && aSig) ||
2275 ((bExp == 0xff) && bSig) ||
2276 ((cExp == 0xff) && cSig)) {
2277 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2281 float_raise(float_flag_invalid STATUS_VAR);
2282 return float32_default_nan;
2285 if (flags & float_muladd_negate_c) {
2289 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2291 /* Work out the sign and type of the product */
2292 pSign = aSign ^ bSign;
2293 if (flags & float_muladd_negate_product) {
2296 pInf = (aExp == 0xff) || (bExp == 0xff);
2297 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2300 if (pInf && (pSign ^ cSign)) {
2301 /* addition of opposite-signed infinities => InvalidOperation */
2302 float_raise(float_flag_invalid STATUS_VAR);
2303 return float32_default_nan;
2305 /* Otherwise generate an infinity of the same sign */
2306 return packFloat32(cSign ^ signflip, 0xff, 0);
2310 return packFloat32(pSign ^ signflip, 0xff, 0);
2316 /* Adding two exact zeroes */
2317 if (pSign == cSign) {
2319 } else if (STATUS(float_rounding_mode) == float_round_down) {
2324 return packFloat32(zSign ^ signflip, 0, 0);
2326 /* Exact zero plus a denorm */
2327 if (STATUS(flush_to_zero)) {
2328 float_raise(float_flag_output_denormal STATUS_VAR);
2329 return packFloat32(cSign ^ signflip, 0, 0);
2332 /* Zero plus something non-zero : just return the something */
2333 return packFloat32(cSign ^ signflip, cExp, cSig);
2337 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2340 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2343 /* Calculate the actual result a * b + c */
2345 /* Multiply first; this is easy. */
2346 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2347 * because we want the true exponent, not the "one-less-than"
2348 * flavour that roundAndPackFloat32() takes.
2350 pExp = aExp + bExp - 0x7e;
2351 aSig = (aSig | 0x00800000) << 7;
2352 bSig = (bSig | 0x00800000) << 8;
2353 pSig64 = (uint64_t)aSig * bSig;
2354 if ((int64_t)(pSig64 << 1) >= 0) {
2359 zSign = pSign ^ signflip;
2361 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2366 /* Throw out the special case of c being an exact zero now */
2367 shift64RightJamming(pSig64, 32, &pSig64);
2369 return roundAndPackFloat32(zSign, pExp - 1,
2372 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2375 cSig64 = (uint64_t)cSig << (62 - 23);
2376 cSig64 |= LIT64(0x4000000000000000);
2377 expDiff = pExp - cExp;
2379 if (pSign == cSign) {
2382 /* scale c to match p */
2383 shift64RightJamming(cSig64, expDiff, &cSig64);
2385 } else if (expDiff < 0) {
2386 /* scale p to match c */
2387 shift64RightJamming(pSig64, -expDiff, &pSig64);
2390 /* no scaling needed */
2393 /* Add significands and make sure explicit bit ends up in posn 62 */
2394 zSig64 = pSig64 + cSig64;
2395 if ((int64_t)zSig64 < 0) {
2396 shift64RightJamming(zSig64, 1, &zSig64);
2403 shift64RightJamming(cSig64, expDiff, &cSig64);
2404 zSig64 = pSig64 - cSig64;
2406 } else if (expDiff < 0) {
2407 shift64RightJamming(pSig64, -expDiff, &pSig64);
2408 zSig64 = cSig64 - pSig64;
2413 if (cSig64 < pSig64) {
2414 zSig64 = pSig64 - cSig64;
2415 } else if (pSig64 < cSig64) {
2416 zSig64 = cSig64 - pSig64;
2421 if (STATUS(float_rounding_mode) == float_round_down) {
2424 return packFloat32(zSign, 0, 0);
2428 /* Normalize to put the explicit bit back into bit 62. */
2429 shiftcount = countLeadingZeros64(zSig64) - 1;
2430 zSig64 <<= shiftcount;
2433 shift64RightJamming(zSig64, 32, &zSig64);
2434 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2438 /*----------------------------------------------------------------------------
2439 | Returns the square root of the single-precision floating-point value `a'.
2440 | The operation is performed according to the IEC/IEEE Standard for Binary
2441 | Floating-Point Arithmetic.
2442 *----------------------------------------------------------------------------*/
2444 float32 float32_sqrt( float32 a STATUS_PARAM )
2447 int_fast16_t aExp, zExp;
2448 uint32_t aSig, zSig;
2450 a = float32_squash_input_denormal(a STATUS_VAR);
2452 aSig = extractFloat32Frac( a );
2453 aExp = extractFloat32Exp( a );
2454 aSign = extractFloat32Sign( a );
2455 if ( aExp == 0xFF ) {
2456 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2457 if ( ! aSign ) return a;
2458 float_raise( float_flag_invalid STATUS_VAR);
2459 return float32_default_nan;
2462 if ( ( aExp | aSig ) == 0 ) return a;
2463 float_raise( float_flag_invalid STATUS_VAR);
2464 return float32_default_nan;
2467 if ( aSig == 0 ) return float32_zero;
2468 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2470 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2471 aSig = ( aSig | 0x00800000 )<<8;
2472 zSig = estimateSqrt32( aExp, aSig ) + 2;
2473 if ( ( zSig & 0x7F ) <= 5 ) {
2479 term = ( (uint64_t) zSig ) * zSig;
2480 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2481 while ( (int64_t) rem < 0 ) {
2483 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2485 zSig |= ( rem != 0 );
2487 shift32RightJamming( zSig, 1, &zSig );
2489 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2493 /*----------------------------------------------------------------------------
2494 | Returns the binary exponential of the single-precision floating-point value
2495 | `a'. The operation is performed according to the IEC/IEEE Standard for
2496 | Binary Floating-Point Arithmetic.
2498 | Uses the following identities:
2500 | 1. -------------------------------------------------------------------------
2504 | 2. -------------------------------------------------------------------------
2507 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2509 *----------------------------------------------------------------------------*/
2511 static const float64 float32_exp2_coefficients[15] =
2513 const_float64( 0x3ff0000000000000ll ), /* 1 */
2514 const_float64( 0x3fe0000000000000ll ), /* 2 */
2515 const_float64( 0x3fc5555555555555ll ), /* 3 */
2516 const_float64( 0x3fa5555555555555ll ), /* 4 */
2517 const_float64( 0x3f81111111111111ll ), /* 5 */
2518 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2519 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2520 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2521 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2522 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2523 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2524 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2525 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2526 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2527 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2530 float32 float32_exp2( float32 a STATUS_PARAM )
2537 a = float32_squash_input_denormal(a STATUS_VAR);
2539 aSig = extractFloat32Frac( a );
2540 aExp = extractFloat32Exp( a );
2541 aSign = extractFloat32Sign( a );
2543 if ( aExp == 0xFF) {
2544 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2545 return (aSign) ? float32_zero : a;
2548 if (aSig == 0) return float32_one;
2551 float_raise( float_flag_inexact STATUS_VAR);
2553 /* ******************************* */
2554 /* using float64 for approximation */
2555 /* ******************************* */
2556 x = float32_to_float64(a STATUS_VAR);
2557 x = float64_mul(x, float64_ln2 STATUS_VAR);
2561 for (i = 0 ; i < 15 ; i++) {
2564 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2565 r = float64_add(r, f STATUS_VAR);
2567 xn = float64_mul(xn, x STATUS_VAR);
2570 return float64_to_float32(r, status);
2573 /*----------------------------------------------------------------------------
2574 | Returns the binary log of the single-precision floating-point value `a'.
2575 | The operation is performed according to the IEC/IEEE Standard for Binary
2576 | Floating-Point Arithmetic.
2577 *----------------------------------------------------------------------------*/
2578 float32 float32_log2( float32 a STATUS_PARAM )
2582 uint32_t aSig, zSig, i;
2584 a = float32_squash_input_denormal(a STATUS_VAR);
2585 aSig = extractFloat32Frac( a );
2586 aExp = extractFloat32Exp( a );
2587 aSign = extractFloat32Sign( a );
2590 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2591 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2594 float_raise( float_flag_invalid STATUS_VAR);
2595 return float32_default_nan;
2597 if ( aExp == 0xFF ) {
2598 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2607 for (i = 1 << 22; i > 0; i >>= 1) {
2608 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2609 if ( aSig & 0x01000000 ) {
2618 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2621 /*----------------------------------------------------------------------------
2622 | Returns 1 if the single-precision floating-point value `a' is equal to
2623 | the corresponding value `b', and 0 otherwise. The invalid exception is
2624 | raised if either operand is a NaN. Otherwise, the comparison is performed
2625 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2626 *----------------------------------------------------------------------------*/
2628 int float32_eq( float32 a, float32 b STATUS_PARAM )
2631 a = float32_squash_input_denormal(a STATUS_VAR);
2632 b = float32_squash_input_denormal(b STATUS_VAR);
2634 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2635 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2637 float_raise( float_flag_invalid STATUS_VAR);
2640 av = float32_val(a);
2641 bv = float32_val(b);
2642 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2645 /*----------------------------------------------------------------------------
2646 | Returns 1 if the single-precision floating-point value `a' is less than
2647 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2648 | exception is raised if either operand is a NaN. The comparison is performed
2649 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2650 *----------------------------------------------------------------------------*/
2652 int float32_le( float32 a, float32 b STATUS_PARAM )
2656 a = float32_squash_input_denormal(a STATUS_VAR);
2657 b = float32_squash_input_denormal(b STATUS_VAR);
2659 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2660 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2662 float_raise( float_flag_invalid STATUS_VAR);
2665 aSign = extractFloat32Sign( a );
2666 bSign = extractFloat32Sign( b );
2667 av = float32_val(a);
2668 bv = float32_val(b);
2669 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2670 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2674 /*----------------------------------------------------------------------------
2675 | Returns 1 if the single-precision floating-point value `a' is less than
2676 | the corresponding value `b', and 0 otherwise. The invalid exception is
2677 | raised if either operand is a NaN. The comparison is performed according
2678 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2679 *----------------------------------------------------------------------------*/
2681 int float32_lt( float32 a, float32 b STATUS_PARAM )
2685 a = float32_squash_input_denormal(a STATUS_VAR);
2686 b = float32_squash_input_denormal(b STATUS_VAR);
2688 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2689 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2691 float_raise( float_flag_invalid STATUS_VAR);
2694 aSign = extractFloat32Sign( a );
2695 bSign = extractFloat32Sign( b );
2696 av = float32_val(a);
2697 bv = float32_val(b);
2698 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2699 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2703 /*----------------------------------------------------------------------------
2704 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2705 | be compared, and 0 otherwise. The invalid exception is raised if either
2706 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2707 | Standard for Binary Floating-Point Arithmetic.
2708 *----------------------------------------------------------------------------*/
2710 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2712 a = float32_squash_input_denormal(a STATUS_VAR);
2713 b = float32_squash_input_denormal(b STATUS_VAR);
2715 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2716 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2718 float_raise( float_flag_invalid STATUS_VAR);
2724 /*----------------------------------------------------------------------------
2725 | Returns 1 if the single-precision floating-point value `a' is equal to
2726 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2727 | exception. The comparison is performed according to the IEC/IEEE Standard
2728 | for Binary Floating-Point Arithmetic.
2729 *----------------------------------------------------------------------------*/
2731 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2733 a = float32_squash_input_denormal(a STATUS_VAR);
2734 b = float32_squash_input_denormal(b STATUS_VAR);
2736 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2737 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2739 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2740 float_raise( float_flag_invalid STATUS_VAR);
2744 return ( float32_val(a) == float32_val(b) ) ||
2745 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2748 /*----------------------------------------------------------------------------
2749 | Returns 1 if the single-precision floating-point value `a' is less than or
2750 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2751 | cause an exception. Otherwise, the comparison is performed according to the
2752 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2753 *----------------------------------------------------------------------------*/
2755 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2759 a = float32_squash_input_denormal(a STATUS_VAR);
2760 b = float32_squash_input_denormal(b STATUS_VAR);
2762 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2763 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2765 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2766 float_raise( float_flag_invalid STATUS_VAR);
2770 aSign = extractFloat32Sign( a );
2771 bSign = extractFloat32Sign( b );
2772 av = float32_val(a);
2773 bv = float32_val(b);
2774 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2775 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2779 /*----------------------------------------------------------------------------
2780 | Returns 1 if the single-precision floating-point value `a' is less than
2781 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2782 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2783 | Standard for Binary Floating-Point Arithmetic.
2784 *----------------------------------------------------------------------------*/
2786 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2790 a = float32_squash_input_denormal(a STATUS_VAR);
2791 b = float32_squash_input_denormal(b STATUS_VAR);
2793 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2794 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2796 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2797 float_raise( float_flag_invalid STATUS_VAR);
2801 aSign = extractFloat32Sign( a );
2802 bSign = extractFloat32Sign( b );
2803 av = float32_val(a);
2804 bv = float32_val(b);
2805 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2806 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2810 /*----------------------------------------------------------------------------
2811 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2812 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2813 | comparison is performed according to the IEC/IEEE Standard for Binary
2814 | Floating-Point Arithmetic.
2815 *----------------------------------------------------------------------------*/
2817 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2819 a = float32_squash_input_denormal(a STATUS_VAR);
2820 b = float32_squash_input_denormal(b STATUS_VAR);
2822 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2823 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2825 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2826 float_raise( float_flag_invalid STATUS_VAR);
2833 /*----------------------------------------------------------------------------
2834 | Returns the result of converting the double-precision floating-point value
2835 | `a' to the 32-bit two's complement integer format. The conversion is
2836 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2837 | Arithmetic---which means in particular that the conversion is rounded
2838 | according to the current rounding mode. If `a' is a NaN, the largest
2839 | positive integer is returned. Otherwise, if the conversion overflows, the
2840 | largest integer with the same sign as `a' is returned.
2841 *----------------------------------------------------------------------------*/
2843 int32 float64_to_int32( float64 a STATUS_PARAM )
2846 int_fast16_t aExp, shiftCount;
2848 a = float64_squash_input_denormal(a STATUS_VAR);
2850 aSig = extractFloat64Frac( a );
2851 aExp = extractFloat64Exp( a );
2852 aSign = extractFloat64Sign( a );
2853 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2854 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2855 shiftCount = 0x42C - aExp;
2856 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2857 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2861 /*----------------------------------------------------------------------------
2862 | Returns the result of converting the double-precision floating-point value
2863 | `a' to the 32-bit two's complement integer format. The conversion is
2864 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2865 | Arithmetic, except that the conversion is always rounded toward zero.
2866 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2867 | the conversion overflows, the largest integer with the same sign as `a' is
2869 *----------------------------------------------------------------------------*/
2871 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2874 int_fast16_t aExp, shiftCount;
2875 uint64_t aSig, savedASig;
2877 a = float64_squash_input_denormal(a STATUS_VAR);
2879 aSig = extractFloat64Frac( a );
2880 aExp = extractFloat64Exp( a );
2881 aSign = extractFloat64Sign( a );
2882 if ( 0x41E < aExp ) {
2883 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2886 else if ( aExp < 0x3FF ) {
2887 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2890 aSig |= LIT64( 0x0010000000000000 );
2891 shiftCount = 0x433 - aExp;
2893 aSig >>= shiftCount;
2895 if ( aSign ) z = - z;
2896 if ( ( z < 0 ) ^ aSign ) {
2898 float_raise( float_flag_invalid STATUS_VAR);
2899 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2901 if ( ( aSig<<shiftCount ) != savedASig ) {
2902 STATUS(float_exception_flags) |= float_flag_inexact;
2908 /*----------------------------------------------------------------------------
2909 | Returns the result of converting the double-precision floating-point value
2910 | `a' to the 16-bit two's complement integer format. The conversion is
2911 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2912 | Arithmetic, except that the conversion is always rounded toward zero.
2913 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2914 | the conversion overflows, the largest integer with the same sign as `a' is
2916 *----------------------------------------------------------------------------*/
2918 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2921 int_fast16_t aExp, shiftCount;
2922 uint64_t aSig, savedASig;
2925 aSig = extractFloat64Frac( a );
2926 aExp = extractFloat64Exp( a );
2927 aSign = extractFloat64Sign( a );
2928 if ( 0x40E < aExp ) {
2929 if ( ( aExp == 0x7FF ) && aSig ) {
2934 else if ( aExp < 0x3FF ) {
2935 if ( aExp || aSig ) {
2936 STATUS(float_exception_flags) |= float_flag_inexact;
2940 aSig |= LIT64( 0x0010000000000000 );
2941 shiftCount = 0x433 - aExp;
2943 aSig >>= shiftCount;
2948 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2950 float_raise( float_flag_invalid STATUS_VAR);
2951 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2953 if ( ( aSig<<shiftCount ) != savedASig ) {
2954 STATUS(float_exception_flags) |= float_flag_inexact;
2959 /*----------------------------------------------------------------------------
2960 | Returns the result of converting the double-precision floating-point value
2961 | `a' to the 64-bit two's complement integer format. The conversion is
2962 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2963 | Arithmetic---which means in particular that the conversion is rounded
2964 | according to the current rounding mode. If `a' is a NaN, the largest
2965 | positive integer is returned. Otherwise, if the conversion overflows, the
2966 | largest integer with the same sign as `a' is returned.
2967 *----------------------------------------------------------------------------*/
2969 int64 float64_to_int64( float64 a STATUS_PARAM )
2972 int_fast16_t aExp, shiftCount;
2973 uint64_t aSig, aSigExtra;
2974 a = float64_squash_input_denormal(a STATUS_VAR);
2976 aSig = extractFloat64Frac( a );
2977 aExp = extractFloat64Exp( a );
2978 aSign = extractFloat64Sign( a );
2979 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2980 shiftCount = 0x433 - aExp;
2981 if ( shiftCount <= 0 ) {
2982 if ( 0x43E < aExp ) {
2983 float_raise( float_flag_invalid STATUS_VAR);
2985 || ( ( aExp == 0x7FF )
2986 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2988 return LIT64( 0x7FFFFFFFFFFFFFFF );
2990 return (int64_t) LIT64( 0x8000000000000000 );
2993 aSig <<= - shiftCount;
2996 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2998 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3002 /*----------------------------------------------------------------------------
3003 | Returns the result of converting the double-precision floating-point value
3004 | `a' to the 64-bit two's complement integer format. The conversion is
3005 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3006 | Arithmetic, except that the conversion is always rounded toward zero.
3007 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3008 | the conversion overflows, the largest integer with the same sign as `a' is
3010 *----------------------------------------------------------------------------*/
3012 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3015 int_fast16_t aExp, shiftCount;
3018 a = float64_squash_input_denormal(a STATUS_VAR);
3020 aSig = extractFloat64Frac( a );
3021 aExp = extractFloat64Exp( a );
3022 aSign = extractFloat64Sign( a );
3023 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3024 shiftCount = aExp - 0x433;
3025 if ( 0 <= shiftCount ) {
3026 if ( 0x43E <= aExp ) {
3027 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3028 float_raise( float_flag_invalid STATUS_VAR);
3030 || ( ( aExp == 0x7FF )
3031 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3033 return LIT64( 0x7FFFFFFFFFFFFFFF );
3036 return (int64_t) LIT64( 0x8000000000000000 );
3038 z = aSig<<shiftCount;
3041 if ( aExp < 0x3FE ) {
3042 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3045 z = aSig>>( - shiftCount );
3046 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3047 STATUS(float_exception_flags) |= float_flag_inexact;
3050 if ( aSign ) z = - z;
3055 /*----------------------------------------------------------------------------
3056 | Returns the result of converting the double-precision floating-point value
3057 | `a' to the single-precision floating-point format. The conversion is
3058 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3060 *----------------------------------------------------------------------------*/
3062 float32 float64_to_float32( float64 a STATUS_PARAM )
3068 a = float64_squash_input_denormal(a STATUS_VAR);
3070 aSig = extractFloat64Frac( a );
3071 aExp = extractFloat64Exp( a );
3072 aSign = extractFloat64Sign( a );
3073 if ( aExp == 0x7FF ) {
3074 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3075 return packFloat32( aSign, 0xFF, 0 );
3077 shift64RightJamming( aSig, 22, &aSig );
3079 if ( aExp || zSig ) {
3083 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3088 /*----------------------------------------------------------------------------
3089 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3090 | half-precision floating-point value, returning the result. After being
3091 | shifted into the proper positions, the three fields are simply added
3092 | together to form the result. This means that any integer portion of `zSig'
3093 | will be added into the exponent. Since a properly normalized significand
3094 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3095 | than the desired result exponent whenever `zSig' is a complete, normalized
3097 *----------------------------------------------------------------------------*/
3098 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3100 return make_float16(
3101 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3104 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3105 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3107 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3113 aSign = extractFloat16Sign(a);
3114 aExp = extractFloat16Exp(a);
3115 aSig = extractFloat16Frac(a);
3117 if (aExp == 0x1f && ieee) {
3119 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3121 return packFloat32(aSign, 0xff, 0);
3127 return packFloat32(aSign, 0, 0);
3130 shiftCount = countLeadingZeros32( aSig ) - 21;
3131 aSig = aSig << shiftCount;
3134 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3137 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3145 int maxexp = ieee ? 15 : 16;
3146 bool rounding_bumps_exp;
3147 bool is_tiny = false;
3149 a = float32_squash_input_denormal(a STATUS_VAR);
3151 aSig = extractFloat32Frac( a );
3152 aExp = extractFloat32Exp( a );
3153 aSign = extractFloat32Sign( a );
3154 if ( aExp == 0xFF ) {
3156 /* Input is a NaN */
3158 float_raise(float_flag_invalid STATUS_VAR);
3159 return packFloat16(aSign, 0, 0);
3161 return commonNaNToFloat16(
3162 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3166 float_raise(float_flag_invalid STATUS_VAR);
3167 return packFloat16(aSign, 0x1f, 0x3ff);
3169 return packFloat16(aSign, 0x1f, 0);
3171 if (aExp == 0 && aSig == 0) {
3172 return packFloat16(aSign, 0, 0);
3174 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3175 * even if the input is denormal; however this is harmless because
3176 * the largest possible single-precision denormal is still smaller
3177 * than the smallest representable half-precision denormal, and so we
3178 * will end up ignoring aSig and returning via the "always return zero"
3183 /* Calculate the mask of bits of the mantissa which are not
3184 * representable in half-precision and will be lost.
3187 /* Will be denormal in halfprec */
3193 /* Normal number in halfprec */
3197 roundingMode = STATUS(float_rounding_mode);
3198 switch (roundingMode) {
3199 case float_round_nearest_even:
3200 increment = (mask + 1) >> 1;
3201 if ((aSig & mask) == increment) {
3202 increment = aSig & (increment << 1);
3205 case float_round_up:
3206 increment = aSign ? 0 : mask;
3208 case float_round_down:
3209 increment = aSign ? mask : 0;
3211 default: /* round_to_zero */
3216 rounding_bumps_exp = (aSig + increment >= 0x01000000);
3218 if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
3220 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3221 return packFloat16(aSign, 0x1f, 0);
3223 float_raise(float_flag_invalid STATUS_VAR);
3224 return packFloat16(aSign, 0x1f, 0x3ff);
3229 /* Note that flush-to-zero does not affect half-precision results */
3231 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3233 || (!rounding_bumps_exp);
3236 float_raise(float_flag_inexact STATUS_VAR);
3238 float_raise(float_flag_underflow STATUS_VAR);
3243 if (rounding_bumps_exp) {
3249 return packFloat16(aSign, 0, 0);
3252 aSig >>= -14 - aExp;
3255 return packFloat16(aSign, aExp + 14, aSig >> 13);
3258 /*----------------------------------------------------------------------------
3259 | Returns the result of converting the double-precision floating-point value
3260 | `a' to the extended double-precision floating-point format. The conversion
3261 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3263 *----------------------------------------------------------------------------*/
3265 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3271 a = float64_squash_input_denormal(a STATUS_VAR);
3272 aSig = extractFloat64Frac( a );
3273 aExp = extractFloat64Exp( a );
3274 aSign = extractFloat64Sign( a );
3275 if ( aExp == 0x7FF ) {
3276 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3277 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3280 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3281 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3285 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3289 /*----------------------------------------------------------------------------
3290 | Returns the result of converting the double-precision floating-point value
3291 | `a' to the quadruple-precision floating-point format. The conversion is
3292 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3294 *----------------------------------------------------------------------------*/
3296 float128 float64_to_float128( float64 a STATUS_PARAM )
3300 uint64_t aSig, zSig0, zSig1;
3302 a = float64_squash_input_denormal(a STATUS_VAR);
3303 aSig = extractFloat64Frac( a );
3304 aExp = extractFloat64Exp( a );
3305 aSign = extractFloat64Sign( a );
3306 if ( aExp == 0x7FF ) {
3307 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3308 return packFloat128( aSign, 0x7FFF, 0, 0 );
3311 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3312 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3315 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3316 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3320 /*----------------------------------------------------------------------------
3321 | Rounds the double-precision floating-point value `a' to an integer, and
3322 | returns the result as a double-precision floating-point value. The
3323 | operation is performed according to the IEC/IEEE Standard for Binary
3324 | Floating-Point Arithmetic.
3325 *----------------------------------------------------------------------------*/
3327 float64 float64_round_to_int( float64 a STATUS_PARAM )
3331 uint64_t lastBitMask, roundBitsMask;
3334 a = float64_squash_input_denormal(a STATUS_VAR);
3336 aExp = extractFloat64Exp( a );
3337 if ( 0x433 <= aExp ) {
3338 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3339 return propagateFloat64NaN( a, a STATUS_VAR );
3343 if ( aExp < 0x3FF ) {
3344 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3345 STATUS(float_exception_flags) |= float_flag_inexact;
3346 aSign = extractFloat64Sign( a );
3347 switch ( STATUS(float_rounding_mode) ) {
3348 case float_round_nearest_even:
3349 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3350 return packFloat64( aSign, 0x3FF, 0 );
3353 case float_round_down:
3354 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3355 case float_round_up:
3356 return make_float64(
3357 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3359 return packFloat64( aSign, 0, 0 );
3362 lastBitMask <<= 0x433 - aExp;
3363 roundBitsMask = lastBitMask - 1;
3365 roundingMode = STATUS(float_rounding_mode);
3366 if ( roundingMode == float_round_nearest_even ) {
3367 z += lastBitMask>>1;
3368 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3370 else if ( roundingMode != float_round_to_zero ) {
3371 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3375 z &= ~ roundBitsMask;
3376 if ( z != float64_val(a) )
3377 STATUS(float_exception_flags) |= float_flag_inexact;
3378 return make_float64(z);
3382 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3386 oldmode = STATUS(float_rounding_mode);
3387 STATUS(float_rounding_mode) = float_round_to_zero;
3388 res = float64_round_to_int(a STATUS_VAR);
3389 STATUS(float_rounding_mode) = oldmode;
3393 /*----------------------------------------------------------------------------
3394 | Returns the result of adding the absolute values of the double-precision
3395 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3396 | before being returned. `zSign' is ignored if the result is a NaN.
3397 | The addition is performed according to the IEC/IEEE Standard for Binary
3398 | Floating-Point Arithmetic.
3399 *----------------------------------------------------------------------------*/
3401 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3403 int_fast16_t aExp, bExp, zExp;
3404 uint64_t aSig, bSig, zSig;
3405 int_fast16_t expDiff;
3407 aSig = extractFloat64Frac( a );
3408 aExp = extractFloat64Exp( a );
3409 bSig = extractFloat64Frac( b );
3410 bExp = extractFloat64Exp( b );
3411 expDiff = aExp - bExp;
3414 if ( 0 < expDiff ) {
3415 if ( aExp == 0x7FF ) {
3416 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3423 bSig |= LIT64( 0x2000000000000000 );
3425 shift64RightJamming( bSig, expDiff, &bSig );
3428 else if ( expDiff < 0 ) {
3429 if ( bExp == 0x7FF ) {
3430 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3431 return packFloat64( zSign, 0x7FF, 0 );
3437 aSig |= LIT64( 0x2000000000000000 );
3439 shift64RightJamming( aSig, - expDiff, &aSig );
3443 if ( aExp == 0x7FF ) {
3444 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3448 if (STATUS(flush_to_zero)) {
3450 float_raise(float_flag_output_denormal STATUS_VAR);
3452 return packFloat64(zSign, 0, 0);
3454 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3456 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3460 aSig |= LIT64( 0x2000000000000000 );
3461 zSig = ( aSig + bSig )<<1;
3463 if ( (int64_t) zSig < 0 ) {
3468 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3472 /*----------------------------------------------------------------------------
3473 | Returns the result of subtracting the absolute values of the double-
3474 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3475 | difference is negated before being returned. `zSign' is ignored if the
3476 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3477 | Standard for Binary Floating-Point Arithmetic.
3478 *----------------------------------------------------------------------------*/
3480 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3482 int_fast16_t aExp, bExp, zExp;
3483 uint64_t aSig, bSig, zSig;
3484 int_fast16_t expDiff;
3486 aSig = extractFloat64Frac( a );
3487 aExp = extractFloat64Exp( a );
3488 bSig = extractFloat64Frac( b );
3489 bExp = extractFloat64Exp( b );
3490 expDiff = aExp - bExp;
3493 if ( 0 < expDiff ) goto aExpBigger;
3494 if ( expDiff < 0 ) goto bExpBigger;
3495 if ( aExp == 0x7FF ) {
3496 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3497 float_raise( float_flag_invalid STATUS_VAR);
3498 return float64_default_nan;
3504 if ( bSig < aSig ) goto aBigger;
3505 if ( aSig < bSig ) goto bBigger;
3506 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3508 if ( bExp == 0x7FF ) {
3509 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3510 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3516 aSig |= LIT64( 0x4000000000000000 );
3518 shift64RightJamming( aSig, - expDiff, &aSig );
3519 bSig |= LIT64( 0x4000000000000000 );
3524 goto normalizeRoundAndPack;
3526 if ( aExp == 0x7FF ) {
3527 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3534 bSig |= LIT64( 0x4000000000000000 );
3536 shift64RightJamming( bSig, expDiff, &bSig );
3537 aSig |= LIT64( 0x4000000000000000 );
3541 normalizeRoundAndPack:
3543 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3547 /*----------------------------------------------------------------------------
3548 | Returns the result of adding the double-precision floating-point values `a'
3549 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3550 | Binary Floating-Point Arithmetic.
3551 *----------------------------------------------------------------------------*/
3553 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3556 a = float64_squash_input_denormal(a STATUS_VAR);
3557 b = float64_squash_input_denormal(b STATUS_VAR);
3559 aSign = extractFloat64Sign( a );
3560 bSign = extractFloat64Sign( b );
3561 if ( aSign == bSign ) {
3562 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3565 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3570 /*----------------------------------------------------------------------------
3571 | Returns the result of subtracting the double-precision floating-point values
3572 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3573 | for Binary Floating-Point Arithmetic.
3574 *----------------------------------------------------------------------------*/
3576 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3579 a = float64_squash_input_denormal(a STATUS_VAR);
3580 b = float64_squash_input_denormal(b STATUS_VAR);
3582 aSign = extractFloat64Sign( a );
3583 bSign = extractFloat64Sign( b );
3584 if ( aSign == bSign ) {
3585 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3588 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3593 /*----------------------------------------------------------------------------
3594 | Returns the result of multiplying the double-precision floating-point values
3595 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3596 | for Binary Floating-Point Arithmetic.
3597 *----------------------------------------------------------------------------*/
3599 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3601 flag aSign, bSign, zSign;
3602 int_fast16_t aExp, bExp, zExp;
3603 uint64_t aSig, bSig, zSig0, zSig1;
3605 a = float64_squash_input_denormal(a STATUS_VAR);
3606 b = float64_squash_input_denormal(b STATUS_VAR);
3608 aSig = extractFloat64Frac( a );
3609 aExp = extractFloat64Exp( a );
3610 aSign = extractFloat64Sign( a );
3611 bSig = extractFloat64Frac( b );
3612 bExp = extractFloat64Exp( b );
3613 bSign = extractFloat64Sign( b );
3614 zSign = aSign ^ bSign;
3615 if ( aExp == 0x7FF ) {
3616 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3617 return propagateFloat64NaN( a, b STATUS_VAR );
3619 if ( ( bExp | bSig ) == 0 ) {
3620 float_raise( float_flag_invalid STATUS_VAR);
3621 return float64_default_nan;
3623 return packFloat64( zSign, 0x7FF, 0 );
3625 if ( bExp == 0x7FF ) {
3626 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3627 if ( ( aExp | aSig ) == 0 ) {
3628 float_raise( float_flag_invalid STATUS_VAR);
3629 return float64_default_nan;
3631 return packFloat64( zSign, 0x7FF, 0 );
3634 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3635 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3638 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3639 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3641 zExp = aExp + bExp - 0x3FF;
3642 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3643 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3644 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3645 zSig0 |= ( zSig1 != 0 );
3646 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3650 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3654 /*----------------------------------------------------------------------------
3655 | Returns the result of dividing the double-precision floating-point value `a'
3656 | by the corresponding value `b'. The operation is performed according to
3657 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3658 *----------------------------------------------------------------------------*/
3660 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3662 flag aSign, bSign, zSign;
3663 int_fast16_t aExp, bExp, zExp;
3664 uint64_t aSig, bSig, zSig;
3665 uint64_t rem0, rem1;
3666 uint64_t term0, term1;
3667 a = float64_squash_input_denormal(a STATUS_VAR);
3668 b = float64_squash_input_denormal(b STATUS_VAR);
3670 aSig = extractFloat64Frac( a );
3671 aExp = extractFloat64Exp( a );
3672 aSign = extractFloat64Sign( a );
3673 bSig = extractFloat64Frac( b );
3674 bExp = extractFloat64Exp( b );
3675 bSign = extractFloat64Sign( b );
3676 zSign = aSign ^ bSign;
3677 if ( aExp == 0x7FF ) {
3678 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3679 if ( bExp == 0x7FF ) {
3680 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3681 float_raise( float_flag_invalid STATUS_VAR);
3682 return float64_default_nan;
3684 return packFloat64( zSign, 0x7FF, 0 );
3686 if ( bExp == 0x7FF ) {
3687 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3688 return packFloat64( zSign, 0, 0 );
3692 if ( ( aExp | aSig ) == 0 ) {
3693 float_raise( float_flag_invalid STATUS_VAR);
3694 return float64_default_nan;
3696 float_raise( float_flag_divbyzero STATUS_VAR);
3697 return packFloat64( zSign, 0x7FF, 0 );
3699 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3702 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3703 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3705 zExp = aExp - bExp + 0x3FD;
3706 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3707 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3708 if ( bSig <= ( aSig + aSig ) ) {
3712 zSig = estimateDiv128To64( aSig, 0, bSig );
3713 if ( ( zSig & 0x1FF ) <= 2 ) {
3714 mul64To128( bSig, zSig, &term0, &term1 );
3715 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3716 while ( (int64_t) rem0 < 0 ) {
3718 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3720 zSig |= ( rem1 != 0 );
3722 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3726 /*----------------------------------------------------------------------------
3727 | Returns the remainder of the double-precision floating-point value `a'
3728 | with respect to the corresponding value `b'. The operation is performed
3729 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3730 *----------------------------------------------------------------------------*/
3732 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3735 int_fast16_t aExp, bExp, expDiff;
3736 uint64_t aSig, bSig;
3737 uint64_t q, alternateASig;
3740 a = float64_squash_input_denormal(a STATUS_VAR);
3741 b = float64_squash_input_denormal(b STATUS_VAR);
3742 aSig = extractFloat64Frac( a );
3743 aExp = extractFloat64Exp( a );
3744 aSign = extractFloat64Sign( a );
3745 bSig = extractFloat64Frac( b );
3746 bExp = extractFloat64Exp( b );
3747 if ( aExp == 0x7FF ) {
3748 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3749 return propagateFloat64NaN( a, b STATUS_VAR );
3751 float_raise( float_flag_invalid STATUS_VAR);
3752 return float64_default_nan;
3754 if ( bExp == 0x7FF ) {
3755 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3760 float_raise( float_flag_invalid STATUS_VAR);
3761 return float64_default_nan;
3763 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3766 if ( aSig == 0 ) return a;
3767 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3769 expDiff = aExp - bExp;
3770 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3771 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3772 if ( expDiff < 0 ) {
3773 if ( expDiff < -1 ) return a;
3776 q = ( bSig <= aSig );
3777 if ( q ) aSig -= bSig;
3779 while ( 0 < expDiff ) {
3780 q = estimateDiv128To64( aSig, 0, bSig );
3781 q = ( 2 < q ) ? q - 2 : 0;
3782 aSig = - ( ( bSig>>2 ) * q );
3786 if ( 0 < expDiff ) {
3787 q = estimateDiv128To64( aSig, 0, bSig );
3788 q = ( 2 < q ) ? q - 2 : 0;
3791 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3798 alternateASig = aSig;
3801 } while ( 0 <= (int64_t) aSig );
3802 sigMean = aSig + alternateASig;
3803 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3804 aSig = alternateASig;
3806 zSign = ( (int64_t) aSig < 0 );
3807 if ( zSign ) aSig = - aSig;
3808 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3812 /*----------------------------------------------------------------------------
3813 | Returns the result of multiplying the double-precision floating-point values
3814 | `a' and `b' then adding 'c', with no intermediate rounding step after the
3815 | multiplication. The operation is performed according to the IEC/IEEE
3816 | Standard for Binary Floating-Point Arithmetic 754-2008.
3817 | The flags argument allows the caller to select negation of the
3818 | addend, the intermediate product, or the final result. (The difference
3819 | between this and having the caller do a separate negation is that negating
3820 | externally will flip the sign bit on NaNs.)
3821 *----------------------------------------------------------------------------*/
3823 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3825 flag aSign, bSign, cSign, zSign;
3826 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3827 uint64_t aSig, bSig, cSig;
3828 flag pInf, pZero, pSign;
3829 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3831 flag signflip, infzero;
3833 a = float64_squash_input_denormal(a STATUS_VAR);
3834 b = float64_squash_input_denormal(b STATUS_VAR);
3835 c = float64_squash_input_denormal(c STATUS_VAR);
3836 aSig = extractFloat64Frac(a);
3837 aExp = extractFloat64Exp(a);
3838 aSign = extractFloat64Sign(a);
3839 bSig = extractFloat64Frac(b);
3840 bExp = extractFloat64Exp(b);
3841 bSign = extractFloat64Sign(b);
3842 cSig = extractFloat64Frac(c);
3843 cExp = extractFloat64Exp(c);
3844 cSign = extractFloat64Sign(c);
3846 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3847 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3849 /* It is implementation-defined whether the cases of (0,inf,qnan)
3850 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3851 * they return if they do), so we have to hand this information
3852 * off to the target-specific pick-a-NaN routine.
3854 if (((aExp == 0x7ff) && aSig) ||
3855 ((bExp == 0x7ff) && bSig) ||
3856 ((cExp == 0x7ff) && cSig)) {
3857 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3861 float_raise(float_flag_invalid STATUS_VAR);
3862 return float64_default_nan;
3865 if (flags & float_muladd_negate_c) {
3869 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3871 /* Work out the sign and type of the product */
3872 pSign = aSign ^ bSign;
3873 if (flags & float_muladd_negate_product) {
3876 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3877 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3879 if (cExp == 0x7ff) {
3880 if (pInf && (pSign ^ cSign)) {
3881 /* addition of opposite-signed infinities => InvalidOperation */
3882 float_raise(float_flag_invalid STATUS_VAR);
3883 return float64_default_nan;
3885 /* Otherwise generate an infinity of the same sign */
3886 return packFloat64(cSign ^ signflip, 0x7ff, 0);
3890 return packFloat64(pSign ^ signflip, 0x7ff, 0);
3896 /* Adding two exact zeroes */
3897 if (pSign == cSign) {
3899 } else if (STATUS(float_rounding_mode) == float_round_down) {
3904 return packFloat64(zSign ^ signflip, 0, 0);
3906 /* Exact zero plus a denorm */
3907 if (STATUS(flush_to_zero)) {
3908 float_raise(float_flag_output_denormal STATUS_VAR);
3909 return packFloat64(cSign ^ signflip, 0, 0);
3912 /* Zero plus something non-zero : just return the something */
3913 return packFloat64(cSign ^ signflip, cExp, cSig);
3917 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3920 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3923 /* Calculate the actual result a * b + c */
3925 /* Multiply first; this is easy. */
3926 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3927 * because we want the true exponent, not the "one-less-than"
3928 * flavour that roundAndPackFloat64() takes.
3930 pExp = aExp + bExp - 0x3fe;
3931 aSig = (aSig | LIT64(0x0010000000000000))<<10;
3932 bSig = (bSig | LIT64(0x0010000000000000))<<11;
3933 mul64To128(aSig, bSig, &pSig0, &pSig1);
3934 if ((int64_t)(pSig0 << 1) >= 0) {
3935 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3939 zSign = pSign ^ signflip;
3941 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3942 * bit in position 126.
3946 /* Throw out the special case of c being an exact zero now */
3947 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3948 return roundAndPackFloat64(zSign, pExp - 1,
3951 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3954 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3955 * significand of the addend, with the explicit bit in position 126.
3957 cSig0 = cSig << (126 - 64 - 52);
3959 cSig0 |= LIT64(0x4000000000000000);
3960 expDiff = pExp - cExp;
3962 if (pSign == cSign) {
3965 /* scale c to match p */
3966 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3968 } else if (expDiff < 0) {
3969 /* scale p to match c */
3970 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3973 /* no scaling needed */
3976 /* Add significands and make sure explicit bit ends up in posn 126 */
3977 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3978 if ((int64_t)zSig0 < 0) {
3979 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3983 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3984 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3988 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3989 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3991 } else if (expDiff < 0) {
3992 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3993 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3998 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3999 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4000 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4001 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4006 if (STATUS(float_rounding_mode) == float_round_down) {
4009 return packFloat64(zSign, 0, 0);
4013 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4014 * starting with the significand in a pair of uint64_t.
4017 shiftcount = countLeadingZeros64(zSig0) - 1;
4018 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4024 shiftcount = countLeadingZeros64(zSig1);
4025 if (shiftcount == 0) {
4026 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4030 zSig0 = zSig1 << shiftcount;
4031 zExp -= (shiftcount + 64);
4034 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4038 /*----------------------------------------------------------------------------
4039 | Returns the square root of the double-precision floating-point value `a'.
4040 | The operation is performed according to the IEC/IEEE Standard for Binary
4041 | Floating-Point Arithmetic.
4042 *----------------------------------------------------------------------------*/
4044 float64 float64_sqrt( float64 a STATUS_PARAM )
4047 int_fast16_t aExp, zExp;
4048 uint64_t aSig, zSig, doubleZSig;
4049 uint64_t rem0, rem1, term0, term1;
4050 a = float64_squash_input_denormal(a STATUS_VAR);
4052 aSig = extractFloat64Frac( a );
4053 aExp = extractFloat64Exp( a );
4054 aSign = extractFloat64Sign( a );
4055 if ( aExp == 0x7FF ) {
4056 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4057 if ( ! aSign ) return a;
4058 float_raise( float_flag_invalid STATUS_VAR);
4059 return float64_default_nan;
4062 if ( ( aExp | aSig ) == 0 ) return a;
4063 float_raise( float_flag_invalid STATUS_VAR);
4064 return float64_default_nan;
4067 if ( aSig == 0 ) return float64_zero;
4068 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4070 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4071 aSig |= LIT64( 0x0010000000000000 );
4072 zSig = estimateSqrt32( aExp, aSig>>21 );
4073 aSig <<= 9 - ( aExp & 1 );
4074 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4075 if ( ( zSig & 0x1FF ) <= 5 ) {
4076 doubleZSig = zSig<<1;
4077 mul64To128( zSig, zSig, &term0, &term1 );
4078 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4079 while ( (int64_t) rem0 < 0 ) {
4082 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4084 zSig |= ( ( rem0 | rem1 ) != 0 );
4086 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4090 /*----------------------------------------------------------------------------
4091 | Returns the binary log of the double-precision floating-point value `a'.
4092 | The operation is performed according to the IEC/IEEE Standard for Binary
4093 | Floating-Point Arithmetic.
4094 *----------------------------------------------------------------------------*/
4095 float64 float64_log2( float64 a STATUS_PARAM )
4099 uint64_t aSig, aSig0, aSig1, zSig, i;
4100 a = float64_squash_input_denormal(a STATUS_VAR);
4102 aSig = extractFloat64Frac( a );
4103 aExp = extractFloat64Exp( a );
4104 aSign = extractFloat64Sign( a );
4107 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4108 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4111 float_raise( float_flag_invalid STATUS_VAR);
4112 return float64_default_nan;
4114 if ( aExp == 0x7FF ) {
4115 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4120 aSig |= LIT64( 0x0010000000000000 );
4122 zSig = (uint64_t)aExp << 52;
4123 for (i = 1LL << 51; i > 0; i >>= 1) {
4124 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4125 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4126 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4134 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4137 /*----------------------------------------------------------------------------
4138 | Returns 1 if the double-precision floating-point value `a' is equal to the
4139 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4140 | if either operand is a NaN. Otherwise, the comparison is performed
4141 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4142 *----------------------------------------------------------------------------*/
4144 int float64_eq( float64 a, float64 b STATUS_PARAM )
4147 a = float64_squash_input_denormal(a STATUS_VAR);
4148 b = float64_squash_input_denormal(b STATUS_VAR);
4150 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4151 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4153 float_raise( float_flag_invalid STATUS_VAR);
4156 av = float64_val(a);
4157 bv = float64_val(b);
4158 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4162 /*----------------------------------------------------------------------------
4163 | Returns 1 if the double-precision floating-point value `a' is less than or
4164 | equal to the corresponding value `b', and 0 otherwise. The invalid
4165 | exception is raised if either operand is a NaN. The comparison is performed
4166 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4167 *----------------------------------------------------------------------------*/
4169 int float64_le( float64 a, float64 b STATUS_PARAM )
4173 a = float64_squash_input_denormal(a STATUS_VAR);
4174 b = float64_squash_input_denormal(b STATUS_VAR);
4176 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4177 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4179 float_raise( float_flag_invalid STATUS_VAR);
4182 aSign = extractFloat64Sign( a );
4183 bSign = extractFloat64Sign( b );
4184 av = float64_val(a);
4185 bv = float64_val(b);
4186 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4187 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4191 /*----------------------------------------------------------------------------
4192 | Returns 1 if the double-precision floating-point value `a' is less than
4193 | the corresponding value `b', and 0 otherwise. The invalid exception is
4194 | raised if either operand is a NaN. The comparison is performed according
4195 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4196 *----------------------------------------------------------------------------*/
4198 int float64_lt( float64 a, float64 b STATUS_PARAM )
4203 a = float64_squash_input_denormal(a STATUS_VAR);
4204 b = float64_squash_input_denormal(b STATUS_VAR);
4205 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4206 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4208 float_raise( float_flag_invalid STATUS_VAR);
4211 aSign = extractFloat64Sign( a );
4212 bSign = extractFloat64Sign( b );
4213 av = float64_val(a);
4214 bv = float64_val(b);
4215 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4216 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4220 /*----------------------------------------------------------------------------
4221 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4222 | be compared, and 0 otherwise. The invalid exception is raised if either
4223 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4224 | Standard for Binary Floating-Point Arithmetic.
4225 *----------------------------------------------------------------------------*/
4227 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4229 a = float64_squash_input_denormal(a STATUS_VAR);
4230 b = float64_squash_input_denormal(b STATUS_VAR);
4232 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4233 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4235 float_raise( float_flag_invalid STATUS_VAR);
4241 /*----------------------------------------------------------------------------
4242 | Returns 1 if the double-precision floating-point value `a' is equal to the
4243 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4244 | exception.The comparison is performed according to the IEC/IEEE Standard
4245 | for Binary Floating-Point Arithmetic.
4246 *----------------------------------------------------------------------------*/
4248 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4251 a = float64_squash_input_denormal(a STATUS_VAR);
4252 b = float64_squash_input_denormal(b STATUS_VAR);
4254 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4255 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4257 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4258 float_raise( float_flag_invalid STATUS_VAR);
4262 av = float64_val(a);
4263 bv = float64_val(b);
4264 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4268 /*----------------------------------------------------------------------------
4269 | Returns 1 if the double-precision floating-point value `a' is less than or
4270 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4271 | cause an exception. Otherwise, the comparison is performed according to the
4272 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4273 *----------------------------------------------------------------------------*/
4275 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4279 a = float64_squash_input_denormal(a STATUS_VAR);
4280 b = float64_squash_input_denormal(b STATUS_VAR);
4282 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4283 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4285 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4286 float_raise( float_flag_invalid STATUS_VAR);
4290 aSign = extractFloat64Sign( a );
4291 bSign = extractFloat64Sign( b );
4292 av = float64_val(a);
4293 bv = float64_val(b);
4294 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4295 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4299 /*----------------------------------------------------------------------------
4300 | Returns 1 if the double-precision floating-point value `a' is less than
4301 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4302 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4303 | Standard for Binary Floating-Point Arithmetic.
4304 *----------------------------------------------------------------------------*/
4306 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4310 a = float64_squash_input_denormal(a STATUS_VAR);
4311 b = float64_squash_input_denormal(b STATUS_VAR);
4313 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4314 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4316 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4317 float_raise( float_flag_invalid STATUS_VAR);
4321 aSign = extractFloat64Sign( a );
4322 bSign = extractFloat64Sign( b );
4323 av = float64_val(a);
4324 bv = float64_val(b);
4325 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4326 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4330 /*----------------------------------------------------------------------------
4331 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4332 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4333 | comparison is performed according to the IEC/IEEE Standard for Binary
4334 | Floating-Point Arithmetic.
4335 *----------------------------------------------------------------------------*/
4337 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4339 a = float64_squash_input_denormal(a STATUS_VAR);
4340 b = float64_squash_input_denormal(b STATUS_VAR);
4342 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4343 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4345 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4346 float_raise( float_flag_invalid STATUS_VAR);
4353 /*----------------------------------------------------------------------------
4354 | Returns the result of converting the extended double-precision floating-
4355 | point value `a' to the 32-bit two's complement integer format. The
4356 | conversion is performed according to the IEC/IEEE Standard for Binary
4357 | Floating-Point Arithmetic---which means in particular that the conversion
4358 | is rounded according to the current rounding mode. If `a' is a NaN, the
4359 | largest positive integer is returned. Otherwise, if the conversion
4360 | overflows, the largest integer with the same sign as `a' is returned.
4361 *----------------------------------------------------------------------------*/
4363 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4366 int32 aExp, shiftCount;
4369 aSig = extractFloatx80Frac( a );
4370 aExp = extractFloatx80Exp( a );
4371 aSign = extractFloatx80Sign( a );
4372 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4373 shiftCount = 0x4037 - aExp;
4374 if ( shiftCount <= 0 ) shiftCount = 1;
4375 shift64RightJamming( aSig, shiftCount, &aSig );
4376 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4380 /*----------------------------------------------------------------------------
4381 | Returns the result of converting the extended double-precision floating-
4382 | point value `a' to the 32-bit two's complement integer format. The
4383 | conversion is performed according to the IEC/IEEE Standard for Binary
4384 | Floating-Point Arithmetic, except that the conversion is always rounded
4385 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4386 | Otherwise, if the conversion overflows, the largest integer with the same
4387 | sign as `a' is returned.
4388 *----------------------------------------------------------------------------*/
4390 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4393 int32 aExp, shiftCount;
4394 uint64_t aSig, savedASig;
4397 aSig = extractFloatx80Frac( a );
4398 aExp = extractFloatx80Exp( a );
4399 aSign = extractFloatx80Sign( a );
4400 if ( 0x401E < aExp ) {
4401 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4404 else if ( aExp < 0x3FFF ) {
4405 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4408 shiftCount = 0x403E - aExp;
4410 aSig >>= shiftCount;
4412 if ( aSign ) z = - z;
4413 if ( ( z < 0 ) ^ aSign ) {
4415 float_raise( float_flag_invalid STATUS_VAR);
4416 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4418 if ( ( aSig<<shiftCount ) != savedASig ) {
4419 STATUS(float_exception_flags) |= float_flag_inexact;
4425 /*----------------------------------------------------------------------------
4426 | Returns the result of converting the extended double-precision floating-
4427 | point value `a' to the 64-bit two's complement integer format. The
4428 | conversion is performed according to the IEC/IEEE Standard for Binary
4429 | Floating-Point Arithmetic---which means in particular that the conversion
4430 | is rounded according to the current rounding mode. If `a' is a NaN,
4431 | the largest positive integer is returned. Otherwise, if the conversion
4432 | overflows, the largest integer with the same sign as `a' is returned.
4433 *----------------------------------------------------------------------------*/
4435 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4438 int32 aExp, shiftCount;
4439 uint64_t aSig, aSigExtra;
4441 aSig = extractFloatx80Frac( a );
4442 aExp = extractFloatx80Exp( a );
4443 aSign = extractFloatx80Sign( a );
4444 shiftCount = 0x403E - aExp;
4445 if ( shiftCount <= 0 ) {
4447 float_raise( float_flag_invalid STATUS_VAR);
4449 || ( ( aExp == 0x7FFF )
4450 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4452 return LIT64( 0x7FFFFFFFFFFFFFFF );
4454 return (int64_t) LIT64( 0x8000000000000000 );
4459 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4461 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4465 /*----------------------------------------------------------------------------
4466 | Returns the result of converting the extended double-precision floating-
4467 | point value `a' to the 64-bit two's complement integer format. The
4468 | conversion is performed according to the IEC/IEEE Standard for Binary
4469 | Floating-Point Arithmetic, except that the conversion is always rounded
4470 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4471 | Otherwise, if the conversion overflows, the largest integer with the same
4472 | sign as `a' is returned.
4473 *----------------------------------------------------------------------------*/
4475 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4478 int32 aExp, shiftCount;
4482 aSig = extractFloatx80Frac( a );
4483 aExp = extractFloatx80Exp( a );
4484 aSign = extractFloatx80Sign( a );
4485 shiftCount = aExp - 0x403E;
4486 if ( 0 <= shiftCount ) {
4487 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4488 if ( ( a.high != 0xC03E ) || aSig ) {
4489 float_raise( float_flag_invalid STATUS_VAR);
4490 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4491 return LIT64( 0x7FFFFFFFFFFFFFFF );
4494 return (int64_t) LIT64( 0x8000000000000000 );
4496 else if ( aExp < 0x3FFF ) {
4497 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4500 z = aSig>>( - shiftCount );
4501 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4502 STATUS(float_exception_flags) |= float_flag_inexact;
4504 if ( aSign ) z = - z;
4509 /*----------------------------------------------------------------------------
4510 | Returns the result of converting the extended double-precision floating-
4511 | point value `a' to the single-precision floating-point format. The
4512 | conversion is performed according to the IEC/IEEE Standard for Binary
4513 | Floating-Point Arithmetic.
4514 *----------------------------------------------------------------------------*/
4516 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4522 aSig = extractFloatx80Frac( a );
4523 aExp = extractFloatx80Exp( a );
4524 aSign = extractFloatx80Sign( a );
4525 if ( aExp == 0x7FFF ) {
4526 if ( (uint64_t) ( aSig<<1 ) ) {
4527 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4529 return packFloat32( aSign, 0xFF, 0 );
4531 shift64RightJamming( aSig, 33, &aSig );
4532 if ( aExp || aSig ) aExp -= 0x3F81;
4533 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4537 /*----------------------------------------------------------------------------
4538 | Returns the result of converting the extended double-precision floating-
4539 | point value `a' to the double-precision floating-point format. The
4540 | conversion is performed according to the IEC/IEEE Standard for Binary
4541 | Floating-Point Arithmetic.
4542 *----------------------------------------------------------------------------*/
4544 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4548 uint64_t aSig, zSig;
4550 aSig = extractFloatx80Frac( a );
4551 aExp = extractFloatx80Exp( a );
4552 aSign = extractFloatx80Sign( a );
4553 if ( aExp == 0x7FFF ) {
4554 if ( (uint64_t) ( aSig<<1 ) ) {
4555 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4557 return packFloat64( aSign, 0x7FF, 0 );
4559 shift64RightJamming( aSig, 1, &zSig );
4560 if ( aExp || aSig ) aExp -= 0x3C01;
4561 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4565 /*----------------------------------------------------------------------------
4566 | Returns the result of converting the extended double-precision floating-
4567 | point value `a' to the quadruple-precision floating-point format. The
4568 | conversion is performed according to the IEC/IEEE Standard for Binary
4569 | Floating-Point Arithmetic.
4570 *----------------------------------------------------------------------------*/
4572 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4576 uint64_t aSig, zSig0, zSig1;
4578 aSig = extractFloatx80Frac( a );
4579 aExp = extractFloatx80Exp( a );
4580 aSign = extractFloatx80Sign( a );
4581 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4582 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4584 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4585 return packFloat128( aSign, aExp, zSig0, zSig1 );
4589 /*----------------------------------------------------------------------------
4590 | Rounds the extended double-precision floating-point value `a' to an integer,
4591 | and returns the result as an extended quadruple-precision floating-point
4592 | value. The operation is performed according to the IEC/IEEE Standard for
4593 | Binary Floating-Point Arithmetic.
4594 *----------------------------------------------------------------------------*/
4596 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4600 uint64_t lastBitMask, roundBitsMask;
4604 aExp = extractFloatx80Exp( a );
4605 if ( 0x403E <= aExp ) {
4606 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4607 return propagateFloatx80NaN( a, a STATUS_VAR );
4611 if ( aExp < 0x3FFF ) {
4613 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4616 STATUS(float_exception_flags) |= float_flag_inexact;
4617 aSign = extractFloatx80Sign( a );
4618 switch ( STATUS(float_rounding_mode) ) {
4619 case float_round_nearest_even:
4620 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4623 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4626 case float_round_down:
4629 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4630 : packFloatx80( 0, 0, 0 );
4631 case float_round_up:
4633 aSign ? packFloatx80( 1, 0, 0 )
4634 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4636 return packFloatx80( aSign, 0, 0 );
4639 lastBitMask <<= 0x403E - aExp;
4640 roundBitsMask = lastBitMask - 1;
4642 roundingMode = STATUS(float_rounding_mode);
4643 if ( roundingMode == float_round_nearest_even ) {
4644 z.low += lastBitMask>>1;
4645 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4647 else if ( roundingMode != float_round_to_zero ) {
4648 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4649 z.low += roundBitsMask;
4652 z.low &= ~ roundBitsMask;
4655 z.low = LIT64( 0x8000000000000000 );
4657 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4662 /*----------------------------------------------------------------------------
4663 | Returns the result of adding the absolute values of the extended double-
4664 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4665 | negated before being returned. `zSign' is ignored if the result is a NaN.
4666 | The addition is performed according to the IEC/IEEE Standard for Binary
4667 | Floating-Point Arithmetic.
4668 *----------------------------------------------------------------------------*/
4670 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4672 int32 aExp, bExp, zExp;
4673 uint64_t aSig, bSig, zSig0, zSig1;
4676 aSig = extractFloatx80Frac( a );
4677 aExp = extractFloatx80Exp( a );
4678 bSig = extractFloatx80Frac( b );
4679 bExp = extractFloatx80Exp( b );
4680 expDiff = aExp - bExp;
4681 if ( 0 < expDiff ) {
4682 if ( aExp == 0x7FFF ) {
4683 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4686 if ( bExp == 0 ) --expDiff;
4687 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4690 else if ( expDiff < 0 ) {
4691 if ( bExp == 0x7FFF ) {
4692 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4693 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4695 if ( aExp == 0 ) ++expDiff;
4696 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4700 if ( aExp == 0x7FFF ) {
4701 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4702 return propagateFloatx80NaN( a, b STATUS_VAR );
4707 zSig0 = aSig + bSig;
4709 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4715 zSig0 = aSig + bSig;
4716 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4718 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4719 zSig0 |= LIT64( 0x8000000000000000 );
4723 roundAndPackFloatx80(
4724 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4728 /*----------------------------------------------------------------------------
4729 | Returns the result of subtracting the absolute values of the extended
4730 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4731 | difference is negated before being returned. `zSign' is ignored if the
4732 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4733 | Standard for Binary Floating-Point Arithmetic.
4734 *----------------------------------------------------------------------------*/
4736 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4738 int32 aExp, bExp, zExp;
4739 uint64_t aSig, bSig, zSig0, zSig1;
4743 aSig = extractFloatx80Frac( a );
4744 aExp = extractFloatx80Exp( a );
4745 bSig = extractFloatx80Frac( b );
4746 bExp = extractFloatx80Exp( b );
4747 expDiff = aExp - bExp;
4748 if ( 0 < expDiff ) goto aExpBigger;
4749 if ( expDiff < 0 ) goto bExpBigger;
4750 if ( aExp == 0x7FFF ) {
4751 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4752 return propagateFloatx80NaN( a, b STATUS_VAR );
4754 float_raise( float_flag_invalid STATUS_VAR);
4755 z.low = floatx80_default_nan_low;
4756 z.high = floatx80_default_nan_high;
4764 if ( bSig < aSig ) goto aBigger;
4765 if ( aSig < bSig ) goto bBigger;
4766 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4768 if ( bExp == 0x7FFF ) {
4769 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4770 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4772 if ( aExp == 0 ) ++expDiff;
4773 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4775 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4778 goto normalizeRoundAndPack;
4780 if ( aExp == 0x7FFF ) {
4781 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4784 if ( bExp == 0 ) --expDiff;
4785 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4787 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4789 normalizeRoundAndPack:
4791 normalizeRoundAndPackFloatx80(
4792 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4796 /*----------------------------------------------------------------------------
4797 | Returns the result of adding the extended double-precision floating-point
4798 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4799 | Standard for Binary Floating-Point Arithmetic.
4800 *----------------------------------------------------------------------------*/
4802 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4806 aSign = extractFloatx80Sign( a );
4807 bSign = extractFloatx80Sign( b );
4808 if ( aSign == bSign ) {
4809 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4812 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4817 /*----------------------------------------------------------------------------
4818 | Returns the result of subtracting the extended double-precision floating-
4819 | point values `a' and `b'. The operation is performed according to the
4820 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4821 *----------------------------------------------------------------------------*/
4823 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4827 aSign = extractFloatx80Sign( a );
4828 bSign = extractFloatx80Sign( b );
4829 if ( aSign == bSign ) {
4830 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4833 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4838 /*----------------------------------------------------------------------------
4839 | Returns the result of multiplying the extended double-precision floating-
4840 | point values `a' and `b'. The operation is performed according to the
4841 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4844 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4846 flag aSign, bSign, zSign;
4847 int32 aExp, bExp, zExp;
4848 uint64_t aSig, bSig, zSig0, zSig1;
4851 aSig = extractFloatx80Frac( a );
4852 aExp = extractFloatx80Exp( a );
4853 aSign = extractFloatx80Sign( a );
4854 bSig = extractFloatx80Frac( b );
4855 bExp = extractFloatx80Exp( b );
4856 bSign = extractFloatx80Sign( b );
4857 zSign = aSign ^ bSign;
4858 if ( aExp == 0x7FFF ) {
4859 if ( (uint64_t) ( aSig<<1 )
4860 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4861 return propagateFloatx80NaN( a, b STATUS_VAR );
4863 if ( ( bExp | bSig ) == 0 ) goto invalid;
4864 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4866 if ( bExp == 0x7FFF ) {
4867 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4868 if ( ( aExp | aSig ) == 0 ) {
4870 float_raise( float_flag_invalid STATUS_VAR);
4871 z.low = floatx80_default_nan_low;
4872 z.high = floatx80_default_nan_high;
4875 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4878 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4879 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4882 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4883 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4885 zExp = aExp + bExp - 0x3FFE;
4886 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4887 if ( 0 < (int64_t) zSig0 ) {
4888 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4892 roundAndPackFloatx80(
4893 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4897 /*----------------------------------------------------------------------------
4898 | Returns the result of dividing the extended double-precision floating-point
4899 | value `a' by the corresponding value `b'. The operation is performed
4900 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4901 *----------------------------------------------------------------------------*/
4903 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4905 flag aSign, bSign, zSign;
4906 int32 aExp, bExp, zExp;
4907 uint64_t aSig, bSig, zSig0, zSig1;
4908 uint64_t rem0, rem1, rem2, term0, term1, term2;
4911 aSig = extractFloatx80Frac( a );
4912 aExp = extractFloatx80Exp( a );
4913 aSign = extractFloatx80Sign( a );
4914 bSig = extractFloatx80Frac( b );
4915 bExp = extractFloatx80Exp( b );
4916 bSign = extractFloatx80Sign( b );
4917 zSign = aSign ^ bSign;
4918 if ( aExp == 0x7FFF ) {
4919 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4920 if ( bExp == 0x7FFF ) {
4921 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4924 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4926 if ( bExp == 0x7FFF ) {
4927 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4928 return packFloatx80( zSign, 0, 0 );
4932 if ( ( aExp | aSig ) == 0 ) {
4934 float_raise( float_flag_invalid STATUS_VAR);
4935 z.low = floatx80_default_nan_low;
4936 z.high = floatx80_default_nan_high;
4939 float_raise( float_flag_divbyzero STATUS_VAR);
4940 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4942 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4945 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4946 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4948 zExp = aExp - bExp + 0x3FFE;
4950 if ( bSig <= aSig ) {
4951 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4954 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4955 mul64To128( bSig, zSig0, &term0, &term1 );
4956 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4957 while ( (int64_t) rem0 < 0 ) {
4959 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4961 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4962 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4963 mul64To128( bSig, zSig1, &term1, &term2 );
4964 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4965 while ( (int64_t) rem1 < 0 ) {
4967 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4969 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4972 roundAndPackFloatx80(
4973 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4977 /*----------------------------------------------------------------------------
4978 | Returns the remainder of the extended double-precision floating-point value
4979 | `a' with respect to the corresponding value `b'. The operation is performed
4980 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4981 *----------------------------------------------------------------------------*/
4983 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4986 int32 aExp, bExp, expDiff;
4987 uint64_t aSig0, aSig1, bSig;
4988 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4991 aSig0 = extractFloatx80Frac( a );
4992 aExp = extractFloatx80Exp( a );
4993 aSign = extractFloatx80Sign( a );
4994 bSig = extractFloatx80Frac( b );
4995 bExp = extractFloatx80Exp( b );
4996 if ( aExp == 0x7FFF ) {
4997 if ( (uint64_t) ( aSig0<<1 )
4998 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4999 return propagateFloatx80NaN( a, b STATUS_VAR );
5003 if ( bExp == 0x7FFF ) {
5004 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5010 float_raise( float_flag_invalid STATUS_VAR);
5011 z.low = floatx80_default_nan_low;
5012 z.high = floatx80_default_nan_high;
5015 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5018 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5019 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5021 bSig |= LIT64( 0x8000000000000000 );
5023 expDiff = aExp - bExp;
5025 if ( expDiff < 0 ) {
5026 if ( expDiff < -1 ) return a;
5027 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5030 q = ( bSig <= aSig0 );
5031 if ( q ) aSig0 -= bSig;
5033 while ( 0 < expDiff ) {
5034 q = estimateDiv128To64( aSig0, aSig1, bSig );
5035 q = ( 2 < q ) ? q - 2 : 0;
5036 mul64To128( bSig, q, &term0, &term1 );
5037 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5038 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5042 if ( 0 < expDiff ) {
5043 q = estimateDiv128To64( aSig0, aSig1, bSig );
5044 q = ( 2 < q ) ? q - 2 : 0;
5046 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5047 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5048 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5049 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5051 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5058 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5059 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5060 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5063 aSig0 = alternateASig0;
5064 aSig1 = alternateASig1;
5068 normalizeRoundAndPackFloatx80(
5069 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5073 /*----------------------------------------------------------------------------
5074 | Returns the square root of the extended double-precision floating-point
5075 | value `a'. The operation is performed according to the IEC/IEEE Standard
5076 | for Binary Floating-Point Arithmetic.
5077 *----------------------------------------------------------------------------*/
5079 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5083 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5084 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5087 aSig0 = extractFloatx80Frac( a );
5088 aExp = extractFloatx80Exp( a );
5089 aSign = extractFloatx80Sign( a );
5090 if ( aExp == 0x7FFF ) {
5091 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5092 if ( ! aSign ) return a;
5096 if ( ( aExp | aSig0 ) == 0 ) return a;
5098 float_raise( float_flag_invalid STATUS_VAR);
5099 z.low = floatx80_default_nan_low;
5100 z.high = floatx80_default_nan_high;
5104 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5105 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5107 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5108 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5109 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5110 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5111 doubleZSig0 = zSig0<<1;
5112 mul64To128( zSig0, zSig0, &term0, &term1 );
5113 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5114 while ( (int64_t) rem0 < 0 ) {
5117 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5119 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5120 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5121 if ( zSig1 == 0 ) zSig1 = 1;
5122 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5123 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5124 mul64To128( zSig1, zSig1, &term2, &term3 );
5125 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5126 while ( (int64_t) rem1 < 0 ) {
5128 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5130 term2 |= doubleZSig0;
5131 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5133 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5135 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5136 zSig0 |= doubleZSig0;
5138 roundAndPackFloatx80(
5139 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5143 /*----------------------------------------------------------------------------
5144 | Returns 1 if the extended double-precision floating-point value `a' is equal
5145 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5146 | raised if either operand is a NaN. Otherwise, the comparison is performed
5147 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5148 *----------------------------------------------------------------------------*/
5150 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5153 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5154 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5155 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5156 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5158 float_raise( float_flag_invalid STATUS_VAR);
5163 && ( ( a.high == b.high )
5165 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5170 /*----------------------------------------------------------------------------
5171 | Returns 1 if the extended double-precision floating-point value `a' is
5172 | less than or equal to the corresponding value `b', and 0 otherwise. The
5173 | invalid exception is raised if either operand is a NaN. The comparison is
5174 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5176 *----------------------------------------------------------------------------*/
5178 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5182 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5183 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5184 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5185 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5187 float_raise( float_flag_invalid STATUS_VAR);
5190 aSign = extractFloatx80Sign( a );
5191 bSign = extractFloatx80Sign( b );
5192 if ( aSign != bSign ) {
5195 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5199 aSign ? le128( b.high, b.low, a.high, a.low )
5200 : le128( a.high, a.low, b.high, b.low );
5204 /*----------------------------------------------------------------------------
5205 | Returns 1 if the extended double-precision floating-point value `a' is
5206 | less than the corresponding value `b', and 0 otherwise. The invalid
5207 | exception is raised if either operand is a NaN. The comparison is performed
5208 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5209 *----------------------------------------------------------------------------*/
5211 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5215 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5216 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5217 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5218 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5220 float_raise( float_flag_invalid STATUS_VAR);
5223 aSign = extractFloatx80Sign( a );
5224 bSign = extractFloatx80Sign( b );
5225 if ( aSign != bSign ) {
5228 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5232 aSign ? lt128( b.high, b.low, a.high, a.low )
5233 : lt128( a.high, a.low, b.high, b.low );
5237 /*----------------------------------------------------------------------------
5238 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5239 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5240 | either operand is a NaN. The comparison is performed according to the
5241 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5242 *----------------------------------------------------------------------------*/
5243 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5245 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5246 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5247 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5248 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5250 float_raise( float_flag_invalid STATUS_VAR);
5256 /*----------------------------------------------------------------------------
5257 | Returns 1 if the extended double-precision floating-point value `a' is
5258 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5259 | cause an exception. The comparison is performed according to the IEC/IEEE
5260 | Standard for Binary Floating-Point Arithmetic.
5261 *----------------------------------------------------------------------------*/
5263 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5266 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5267 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5268 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5269 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5271 if ( floatx80_is_signaling_nan( a )
5272 || floatx80_is_signaling_nan( b ) ) {
5273 float_raise( float_flag_invalid STATUS_VAR);
5279 && ( ( a.high == b.high )
5281 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5286 /*----------------------------------------------------------------------------
5287 | Returns 1 if the extended double-precision floating-point value `a' is less
5288 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5289 | do not cause an exception. Otherwise, the comparison is performed according
5290 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5291 *----------------------------------------------------------------------------*/
5293 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5297 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5298 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5299 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5300 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5302 if ( floatx80_is_signaling_nan( a )
5303 || floatx80_is_signaling_nan( b ) ) {
5304 float_raise( float_flag_invalid STATUS_VAR);
5308 aSign = extractFloatx80Sign( a );
5309 bSign = extractFloatx80Sign( b );
5310 if ( aSign != bSign ) {
5313 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5317 aSign ? le128( b.high, b.low, a.high, a.low )
5318 : le128( a.high, a.low, b.high, b.low );
5322 /*----------------------------------------------------------------------------
5323 | Returns 1 if the extended double-precision floating-point value `a' is less
5324 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5325 | an exception. Otherwise, the comparison is performed according to the
5326 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5329 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5333 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5334 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5335 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5336 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5338 if ( floatx80_is_signaling_nan( a )
5339 || floatx80_is_signaling_nan( b ) ) {
5340 float_raise( float_flag_invalid STATUS_VAR);
5344 aSign = extractFloatx80Sign( a );
5345 bSign = extractFloatx80Sign( b );
5346 if ( aSign != bSign ) {
5349 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5353 aSign ? lt128( b.high, b.low, a.high, a.low )
5354 : lt128( a.high, a.low, b.high, b.low );
5358 /*----------------------------------------------------------------------------
5359 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5360 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5361 | The comparison is performed according to the IEC/IEEE Standard for Binary
5362 | Floating-Point Arithmetic.
5363 *----------------------------------------------------------------------------*/
5364 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5366 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5367 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5368 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5369 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5371 if ( floatx80_is_signaling_nan( a )
5372 || floatx80_is_signaling_nan( b ) ) {
5373 float_raise( float_flag_invalid STATUS_VAR);
5380 /*----------------------------------------------------------------------------
5381 | Returns the result of converting the quadruple-precision floating-point
5382 | value `a' to the 32-bit two's complement integer format. The conversion
5383 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5384 | Arithmetic---which means in particular that the conversion is rounded
5385 | according to the current rounding mode. If `a' is a NaN, the largest
5386 | positive integer is returned. Otherwise, if the conversion overflows, the
5387 | largest integer with the same sign as `a' is returned.
5388 *----------------------------------------------------------------------------*/
5390 int32 float128_to_int32( float128 a STATUS_PARAM )
5393 int32 aExp, shiftCount;
5394 uint64_t aSig0, aSig1;
5396 aSig1 = extractFloat128Frac1( a );
5397 aSig0 = extractFloat128Frac0( a );
5398 aExp = extractFloat128Exp( a );
5399 aSign = extractFloat128Sign( a );
5400 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5401 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5402 aSig0 |= ( aSig1 != 0 );
5403 shiftCount = 0x4028 - aExp;
5404 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5405 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5409 /*----------------------------------------------------------------------------
5410 | Returns the result of converting the quadruple-precision floating-point
5411 | value `a' to the 32-bit two's complement integer format. The conversion
5412 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5413 | Arithmetic, except that the conversion is always rounded toward zero. If
5414 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5415 | conversion overflows, the largest integer with the same sign as `a' is
5417 *----------------------------------------------------------------------------*/
5419 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5422 int32 aExp, shiftCount;
5423 uint64_t aSig0, aSig1, savedASig;
5426 aSig1 = extractFloat128Frac1( a );
5427 aSig0 = extractFloat128Frac0( a );
5428 aExp = extractFloat128Exp( a );
5429 aSign = extractFloat128Sign( a );
5430 aSig0 |= ( aSig1 != 0 );
5431 if ( 0x401E < aExp ) {
5432 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5435 else if ( aExp < 0x3FFF ) {
5436 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5439 aSig0 |= LIT64( 0x0001000000000000 );
5440 shiftCount = 0x402F - aExp;
5442 aSig0 >>= shiftCount;
5444 if ( aSign ) z = - z;
5445 if ( ( z < 0 ) ^ aSign ) {
5447 float_raise( float_flag_invalid STATUS_VAR);
5448 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5450 if ( ( aSig0<<shiftCount ) != savedASig ) {
5451 STATUS(float_exception_flags) |= float_flag_inexact;
5457 /*----------------------------------------------------------------------------
5458 | Returns the result of converting the quadruple-precision floating-point
5459 | value `a' to the 64-bit two's complement integer format. The conversion
5460 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5461 | Arithmetic---which means in particular that the conversion is rounded
5462 | according to the current rounding mode. If `a' is a NaN, the largest
5463 | positive integer is returned. Otherwise, if the conversion overflows, the
5464 | largest integer with the same sign as `a' is returned.
5465 *----------------------------------------------------------------------------*/
5467 int64 float128_to_int64( float128 a STATUS_PARAM )
5470 int32 aExp, shiftCount;
5471 uint64_t aSig0, aSig1;
5473 aSig1 = extractFloat128Frac1( a );
5474 aSig0 = extractFloat128Frac0( a );
5475 aExp = extractFloat128Exp( a );
5476 aSign = extractFloat128Sign( a );
5477 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5478 shiftCount = 0x402F - aExp;
5479 if ( shiftCount <= 0 ) {
5480 if ( 0x403E < aExp ) {
5481 float_raise( float_flag_invalid STATUS_VAR);
5483 || ( ( aExp == 0x7FFF )
5484 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5487 return LIT64( 0x7FFFFFFFFFFFFFFF );
5489 return (int64_t) LIT64( 0x8000000000000000 );
5491 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5494 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5496 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5500 /*----------------------------------------------------------------------------
5501 | Returns the result of converting the quadruple-precision floating-point
5502 | value `a' to the 64-bit two's complement integer format. The conversion
5503 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5504 | Arithmetic, except that the conversion is always rounded toward zero.
5505 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5506 | the conversion overflows, the largest integer with the same sign as `a' is
5508 *----------------------------------------------------------------------------*/
5510 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5513 int32 aExp, shiftCount;
5514 uint64_t aSig0, aSig1;
5517 aSig1 = extractFloat128Frac1( a );
5518 aSig0 = extractFloat128Frac0( a );
5519 aExp = extractFloat128Exp( a );
5520 aSign = extractFloat128Sign( a );
5521 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5522 shiftCount = aExp - 0x402F;
5523 if ( 0 < shiftCount ) {
5524 if ( 0x403E <= aExp ) {
5525 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5526 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5527 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5528 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5531 float_raise( float_flag_invalid STATUS_VAR);
5532 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5533 return LIT64( 0x7FFFFFFFFFFFFFFF );
5536 return (int64_t) LIT64( 0x8000000000000000 );
5538 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5539 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5540 STATUS(float_exception_flags) |= float_flag_inexact;
5544 if ( aExp < 0x3FFF ) {
5545 if ( aExp | aSig0 | aSig1 ) {
5546 STATUS(float_exception_flags) |= float_flag_inexact;
5550 z = aSig0>>( - shiftCount );
5552 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5553 STATUS(float_exception_flags) |= float_flag_inexact;
5556 if ( aSign ) z = - z;
5561 /*----------------------------------------------------------------------------
5562 | Returns the result of converting the quadruple-precision floating-point
5563 | value `a' to the single-precision floating-point format. The conversion
5564 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5566 *----------------------------------------------------------------------------*/
5568 float32 float128_to_float32( float128 a STATUS_PARAM )
5572 uint64_t aSig0, aSig1;
5575 aSig1 = extractFloat128Frac1( a );
5576 aSig0 = extractFloat128Frac0( a );
5577 aExp = extractFloat128Exp( a );
5578 aSign = extractFloat128Sign( a );
5579 if ( aExp == 0x7FFF ) {
5580 if ( aSig0 | aSig1 ) {
5581 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5583 return packFloat32( aSign, 0xFF, 0 );
5585 aSig0 |= ( aSig1 != 0 );
5586 shift64RightJamming( aSig0, 18, &aSig0 );
5588 if ( aExp || zSig ) {
5592 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5596 /*----------------------------------------------------------------------------
5597 | Returns the result of converting the quadruple-precision floating-point
5598 | value `a' to the double-precision floating-point format. The conversion
5599 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5601 *----------------------------------------------------------------------------*/
5603 float64 float128_to_float64( float128 a STATUS_PARAM )
5607 uint64_t aSig0, aSig1;
5609 aSig1 = extractFloat128Frac1( a );
5610 aSig0 = extractFloat128Frac0( a );
5611 aExp = extractFloat128Exp( a );
5612 aSign = extractFloat128Sign( a );
5613 if ( aExp == 0x7FFF ) {
5614 if ( aSig0 | aSig1 ) {
5615 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5617 return packFloat64( aSign, 0x7FF, 0 );
5619 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5620 aSig0 |= ( aSig1 != 0 );
5621 if ( aExp || aSig0 ) {
5622 aSig0 |= LIT64( 0x4000000000000000 );
5625 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5629 /*----------------------------------------------------------------------------
5630 | Returns the result of converting the quadruple-precision floating-point
5631 | value `a' to the extended double-precision floating-point format. The
5632 | conversion is performed according to the IEC/IEEE Standard for Binary
5633 | Floating-Point Arithmetic.
5634 *----------------------------------------------------------------------------*/
5636 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5640 uint64_t aSig0, aSig1;
5642 aSig1 = extractFloat128Frac1( a );
5643 aSig0 = extractFloat128Frac0( a );
5644 aExp = extractFloat128Exp( a );
5645 aSign = extractFloat128Sign( a );
5646 if ( aExp == 0x7FFF ) {
5647 if ( aSig0 | aSig1 ) {
5648 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5650 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5653 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5654 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5657 aSig0 |= LIT64( 0x0001000000000000 );
5659 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5660 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5664 /*----------------------------------------------------------------------------
5665 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5666 | returns the result as a quadruple-precision floating-point value. The
5667 | operation is performed according to the IEC/IEEE Standard for Binary
5668 | Floating-Point Arithmetic.
5669 *----------------------------------------------------------------------------*/
5671 float128 float128_round_to_int( float128 a STATUS_PARAM )
5675 uint64_t lastBitMask, roundBitsMask;
5679 aExp = extractFloat128Exp( a );
5680 if ( 0x402F <= aExp ) {
5681 if ( 0x406F <= aExp ) {
5682 if ( ( aExp == 0x7FFF )
5683 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5685 return propagateFloat128NaN( a, a STATUS_VAR );
5690 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5691 roundBitsMask = lastBitMask - 1;
5693 roundingMode = STATUS(float_rounding_mode);
5694 if ( roundingMode == float_round_nearest_even ) {
5695 if ( lastBitMask ) {
5696 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5697 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5700 if ( (int64_t) z.low < 0 ) {
5702 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5706 else if ( roundingMode != float_round_to_zero ) {
5707 if ( extractFloat128Sign( z )
5708 ^ ( roundingMode == float_round_up ) ) {
5709 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5712 z.low &= ~ roundBitsMask;
5715 if ( aExp < 0x3FFF ) {
5716 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5717 STATUS(float_exception_flags) |= float_flag_inexact;
5718 aSign = extractFloat128Sign( a );
5719 switch ( STATUS(float_rounding_mode) ) {
5720 case float_round_nearest_even:
5721 if ( ( aExp == 0x3FFE )
5722 && ( extractFloat128Frac0( a )
5723 | extractFloat128Frac1( a ) )
5725 return packFloat128( aSign, 0x3FFF, 0, 0 );
5728 case float_round_down:
5730 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5731 : packFloat128( 0, 0, 0, 0 );
5732 case float_round_up:
5734 aSign ? packFloat128( 1, 0, 0, 0 )
5735 : packFloat128( 0, 0x3FFF, 0, 0 );
5737 return packFloat128( aSign, 0, 0, 0 );
5740 lastBitMask <<= 0x402F - aExp;
5741 roundBitsMask = lastBitMask - 1;
5744 roundingMode = STATUS(float_rounding_mode);
5745 if ( roundingMode == float_round_nearest_even ) {
5746 z.high += lastBitMask>>1;
5747 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5748 z.high &= ~ lastBitMask;
5751 else if ( roundingMode != float_round_to_zero ) {
5752 if ( extractFloat128Sign( z )
5753 ^ ( roundingMode == float_round_up ) ) {
5754 z.high |= ( a.low != 0 );
5755 z.high += roundBitsMask;
5758 z.high &= ~ roundBitsMask;
5760 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5761 STATUS(float_exception_flags) |= float_flag_inexact;
5767 /*----------------------------------------------------------------------------
5768 | Returns the result of adding the absolute values of the quadruple-precision
5769 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5770 | before being returned. `zSign' is ignored if the result is a NaN.
5771 | The addition is performed according to the IEC/IEEE Standard for Binary
5772 | Floating-Point Arithmetic.
5773 *----------------------------------------------------------------------------*/
5775 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5777 int32 aExp, bExp, zExp;
5778 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5781 aSig1 = extractFloat128Frac1( a );
5782 aSig0 = extractFloat128Frac0( a );
5783 aExp = extractFloat128Exp( a );
5784 bSig1 = extractFloat128Frac1( b );
5785 bSig0 = extractFloat128Frac0( b );
5786 bExp = extractFloat128Exp( b );
5787 expDiff = aExp - bExp;
5788 if ( 0 < expDiff ) {
5789 if ( aExp == 0x7FFF ) {
5790 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5797 bSig0 |= LIT64( 0x0001000000000000 );
5799 shift128ExtraRightJamming(
5800 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5803 else if ( expDiff < 0 ) {
5804 if ( bExp == 0x7FFF ) {
5805 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5806 return packFloat128( zSign, 0x7FFF, 0, 0 );
5812 aSig0 |= LIT64( 0x0001000000000000 );
5814 shift128ExtraRightJamming(
5815 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5819 if ( aExp == 0x7FFF ) {
5820 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5821 return propagateFloat128NaN( a, b STATUS_VAR );
5825 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5827 if (STATUS(flush_to_zero)) {
5828 if (zSig0 | zSig1) {
5829 float_raise(float_flag_output_denormal STATUS_VAR);
5831 return packFloat128(zSign, 0, 0, 0);
5833 return packFloat128( zSign, 0, zSig0, zSig1 );
5836 zSig0 |= LIT64( 0x0002000000000000 );
5840 aSig0 |= LIT64( 0x0001000000000000 );
5841 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5843 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5846 shift128ExtraRightJamming(
5847 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5849 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5853 /*----------------------------------------------------------------------------
5854 | Returns the result of subtracting the absolute values of the quadruple-
5855 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5856 | difference is negated before being returned. `zSign' is ignored if the
5857 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5858 | Standard for Binary Floating-Point Arithmetic.
5859 *----------------------------------------------------------------------------*/
5861 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5863 int32 aExp, bExp, zExp;
5864 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5868 aSig1 = extractFloat128Frac1( a );
5869 aSig0 = extractFloat128Frac0( a );
5870 aExp = extractFloat128Exp( a );
5871 bSig1 = extractFloat128Frac1( b );
5872 bSig0 = extractFloat128Frac0( b );
5873 bExp = extractFloat128Exp( b );
5874 expDiff = aExp - bExp;
5875 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5876 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5877 if ( 0 < expDiff ) goto aExpBigger;
5878 if ( expDiff < 0 ) goto bExpBigger;
5879 if ( aExp == 0x7FFF ) {
5880 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5881 return propagateFloat128NaN( a, b STATUS_VAR );
5883 float_raise( float_flag_invalid STATUS_VAR);
5884 z.low = float128_default_nan_low;
5885 z.high = float128_default_nan_high;
5892 if ( bSig0 < aSig0 ) goto aBigger;
5893 if ( aSig0 < bSig0 ) goto bBigger;
5894 if ( bSig1 < aSig1 ) goto aBigger;
5895 if ( aSig1 < bSig1 ) goto bBigger;
5896 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5898 if ( bExp == 0x7FFF ) {
5899 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5900 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5906 aSig0 |= LIT64( 0x4000000000000000 );
5908 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5909 bSig0 |= LIT64( 0x4000000000000000 );
5911 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5914 goto normalizeRoundAndPack;
5916 if ( aExp == 0x7FFF ) {
5917 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5924 bSig0 |= LIT64( 0x4000000000000000 );
5926 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5927 aSig0 |= LIT64( 0x4000000000000000 );
5929 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5931 normalizeRoundAndPack:
5933 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5937 /*----------------------------------------------------------------------------
5938 | Returns the result of adding the quadruple-precision floating-point values
5939 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5940 | for Binary Floating-Point Arithmetic.
5941 *----------------------------------------------------------------------------*/
5943 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5947 aSign = extractFloat128Sign( a );
5948 bSign = extractFloat128Sign( b );
5949 if ( aSign == bSign ) {
5950 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5953 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5958 /*----------------------------------------------------------------------------
5959 | Returns the result of subtracting the quadruple-precision floating-point
5960 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5961 | Standard for Binary Floating-Point Arithmetic.
5962 *----------------------------------------------------------------------------*/
5964 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5968 aSign = extractFloat128Sign( a );
5969 bSign = extractFloat128Sign( b );
5970 if ( aSign == bSign ) {
5971 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5974 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5979 /*----------------------------------------------------------------------------
5980 | Returns the result of multiplying the quadruple-precision floating-point
5981 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5982 | Standard for Binary Floating-Point Arithmetic.
5983 *----------------------------------------------------------------------------*/
5985 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5987 flag aSign, bSign, zSign;
5988 int32 aExp, bExp, zExp;
5989 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5992 aSig1 = extractFloat128Frac1( a );
5993 aSig0 = extractFloat128Frac0( a );
5994 aExp = extractFloat128Exp( a );
5995 aSign = extractFloat128Sign( a );
5996 bSig1 = extractFloat128Frac1( b );
5997 bSig0 = extractFloat128Frac0( b );
5998 bExp = extractFloat128Exp( b );
5999 bSign = extractFloat128Sign( b );
6000 zSign = aSign ^ bSign;
6001 if ( aExp == 0x7FFF ) {
6002 if ( ( aSig0 | aSig1 )
6003 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6004 return propagateFloat128NaN( a, b STATUS_VAR );
6006 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6007 return packFloat128( zSign, 0x7FFF, 0, 0 );
6009 if ( bExp == 0x7FFF ) {
6010 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6011 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6013 float_raise( float_flag_invalid STATUS_VAR);
6014 z.low = float128_default_nan_low;
6015 z.high = float128_default_nan_high;
6018 return packFloat128( zSign, 0x7FFF, 0, 0 );
6021 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6022 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6025 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6026 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6028 zExp = aExp + bExp - 0x4000;
6029 aSig0 |= LIT64( 0x0001000000000000 );
6030 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6031 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6032 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6033 zSig2 |= ( zSig3 != 0 );
6034 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6035 shift128ExtraRightJamming(
6036 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6039 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6043 /*----------------------------------------------------------------------------
6044 | Returns the result of dividing the quadruple-precision floating-point value
6045 | `a' by the corresponding value `b'. The operation is performed according to
6046 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6047 *----------------------------------------------------------------------------*/
6049 float128 float128_div( float128 a, float128 b STATUS_PARAM )
6051 flag aSign, bSign, zSign;
6052 int32 aExp, bExp, zExp;
6053 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6054 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6057 aSig1 = extractFloat128Frac1( a );
6058 aSig0 = extractFloat128Frac0( a );
6059 aExp = extractFloat128Exp( a );
6060 aSign = extractFloat128Sign( a );
6061 bSig1 = extractFloat128Frac1( b );
6062 bSig0 = extractFloat128Frac0( b );
6063 bExp = extractFloat128Exp( b );
6064 bSign = extractFloat128Sign( b );
6065 zSign = aSign ^ bSign;
6066 if ( aExp == 0x7FFF ) {
6067 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6068 if ( bExp == 0x7FFF ) {
6069 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6072 return packFloat128( zSign, 0x7FFF, 0, 0 );
6074 if ( bExp == 0x7FFF ) {
6075 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6076 return packFloat128( zSign, 0, 0, 0 );
6079 if ( ( bSig0 | bSig1 ) == 0 ) {
6080 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6082 float_raise( float_flag_invalid STATUS_VAR);
6083 z.low = float128_default_nan_low;
6084 z.high = float128_default_nan_high;
6087 float_raise( float_flag_divbyzero STATUS_VAR);
6088 return packFloat128( zSign, 0x7FFF, 0, 0 );
6090 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6093 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6094 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6096 zExp = aExp - bExp + 0x3FFD;
6098 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6100 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6101 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6102 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6105 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6106 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6107 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6108 while ( (int64_t) rem0 < 0 ) {
6110 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6112 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6113 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6114 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6115 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6116 while ( (int64_t) rem1 < 0 ) {
6118 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6120 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6122 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6123 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6127 /*----------------------------------------------------------------------------
6128 | Returns the remainder of the quadruple-precision floating-point value `a'
6129 | with respect to the corresponding value `b'. The operation is performed
6130 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6131 *----------------------------------------------------------------------------*/
6133 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6136 int32 aExp, bExp, expDiff;
6137 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6138 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6142 aSig1 = extractFloat128Frac1( a );
6143 aSig0 = extractFloat128Frac0( a );
6144 aExp = extractFloat128Exp( a );
6145 aSign = extractFloat128Sign( a );
6146 bSig1 = extractFloat128Frac1( b );
6147 bSig0 = extractFloat128Frac0( b );
6148 bExp = extractFloat128Exp( b );
6149 if ( aExp == 0x7FFF ) {
6150 if ( ( aSig0 | aSig1 )
6151 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6152 return propagateFloat128NaN( a, b STATUS_VAR );
6156 if ( bExp == 0x7FFF ) {
6157 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6161 if ( ( bSig0 | bSig1 ) == 0 ) {
6163 float_raise( float_flag_invalid STATUS_VAR);
6164 z.low = float128_default_nan_low;
6165 z.high = float128_default_nan_high;
6168 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6171 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6172 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6174 expDiff = aExp - bExp;
6175 if ( expDiff < -1 ) return a;
6177 aSig0 | LIT64( 0x0001000000000000 ),
6179 15 - ( expDiff < 0 ),
6184 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6185 q = le128( bSig0, bSig1, aSig0, aSig1 );
6186 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6188 while ( 0 < expDiff ) {
6189 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6190 q = ( 4 < q ) ? q - 4 : 0;
6191 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6192 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6193 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6194 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6197 if ( -64 < expDiff ) {
6198 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6199 q = ( 4 < q ) ? q - 4 : 0;
6201 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6203 if ( expDiff < 0 ) {
6204 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6207 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6209 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6210 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6213 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6214 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6217 alternateASig0 = aSig0;
6218 alternateASig1 = aSig1;
6220 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6221 } while ( 0 <= (int64_t) aSig0 );
6223 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6224 if ( ( sigMean0 < 0 )
6225 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6226 aSig0 = alternateASig0;
6227 aSig1 = alternateASig1;
6229 zSign = ( (int64_t) aSig0 < 0 );
6230 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6232 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6236 /*----------------------------------------------------------------------------
6237 | Returns the square root of the quadruple-precision floating-point value `a'.
6238 | The operation is performed according to the IEC/IEEE Standard for Binary
6239 | Floating-Point Arithmetic.
6240 *----------------------------------------------------------------------------*/
6242 float128 float128_sqrt( float128 a STATUS_PARAM )
6246 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6247 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6250 aSig1 = extractFloat128Frac1( a );
6251 aSig0 = extractFloat128Frac0( a );
6252 aExp = extractFloat128Exp( a );
6253 aSign = extractFloat128Sign( a );
6254 if ( aExp == 0x7FFF ) {
6255 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6256 if ( ! aSign ) return a;
6260 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6262 float_raise( float_flag_invalid STATUS_VAR);
6263 z.low = float128_default_nan_low;
6264 z.high = float128_default_nan_high;
6268 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6269 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6271 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6272 aSig0 |= LIT64( 0x0001000000000000 );
6273 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6274 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6275 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6276 doubleZSig0 = zSig0<<1;
6277 mul64To128( zSig0, zSig0, &term0, &term1 );
6278 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6279 while ( (int64_t) rem0 < 0 ) {
6282 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6284 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6285 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6286 if ( zSig1 == 0 ) zSig1 = 1;
6287 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6288 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6289 mul64To128( zSig1, zSig1, &term2, &term3 );
6290 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6291 while ( (int64_t) rem1 < 0 ) {
6293 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6295 term2 |= doubleZSig0;
6296 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6298 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6300 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6301 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6305 /*----------------------------------------------------------------------------
6306 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6307 | the corresponding value `b', and 0 otherwise. The invalid exception is
6308 | raised if either operand is a NaN. Otherwise, the comparison is performed
6309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6310 *----------------------------------------------------------------------------*/
6312 int float128_eq( float128 a, float128 b STATUS_PARAM )
6315 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6316 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6317 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6318 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6320 float_raise( float_flag_invalid STATUS_VAR);
6325 && ( ( a.high == b.high )
6327 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6332 /*----------------------------------------------------------------------------
6333 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6334 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6335 | exception is raised if either operand is a NaN. The comparison is performed
6336 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6337 *----------------------------------------------------------------------------*/
6339 int float128_le( float128 a, float128 b STATUS_PARAM )
6343 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6344 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6345 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6346 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6348 float_raise( float_flag_invalid STATUS_VAR);
6351 aSign = extractFloat128Sign( a );
6352 bSign = extractFloat128Sign( b );
6353 if ( aSign != bSign ) {
6356 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6360 aSign ? le128( b.high, b.low, a.high, a.low )
6361 : le128( a.high, a.low, b.high, b.low );
6365 /*----------------------------------------------------------------------------
6366 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6367 | the corresponding value `b', and 0 otherwise. The invalid exception is
6368 | raised if either operand is a NaN. The comparison is performed according
6369 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6370 *----------------------------------------------------------------------------*/
6372 int float128_lt( float128 a, float128 b STATUS_PARAM )
6376 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6377 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6378 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6379 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6381 float_raise( float_flag_invalid STATUS_VAR);
6384 aSign = extractFloat128Sign( a );
6385 bSign = extractFloat128Sign( b );
6386 if ( aSign != bSign ) {
6389 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6393 aSign ? lt128( b.high, b.low, a.high, a.low )
6394 : lt128( a.high, a.low, b.high, b.low );
6398 /*----------------------------------------------------------------------------
6399 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6400 | be compared, and 0 otherwise. The invalid exception is raised if either
6401 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6402 | Standard for Binary Floating-Point Arithmetic.
6403 *----------------------------------------------------------------------------*/
6405 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6407 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6408 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6409 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6410 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6412 float_raise( float_flag_invalid STATUS_VAR);
6418 /*----------------------------------------------------------------------------
6419 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6420 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6421 | exception. The comparison is performed according to the IEC/IEEE Standard
6422 | for Binary Floating-Point Arithmetic.
6423 *----------------------------------------------------------------------------*/
6425 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6428 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6429 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6430 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6431 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6433 if ( float128_is_signaling_nan( a )
6434 || float128_is_signaling_nan( b ) ) {
6435 float_raise( float_flag_invalid STATUS_VAR);
6441 && ( ( a.high == b.high )
6443 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6448 /*----------------------------------------------------------------------------
6449 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6450 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6451 | cause an exception. Otherwise, the comparison is performed according to the
6452 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6453 *----------------------------------------------------------------------------*/
6455 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6459 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6460 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6461 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6462 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6464 if ( float128_is_signaling_nan( a )
6465 || float128_is_signaling_nan( b ) ) {
6466 float_raise( float_flag_invalid STATUS_VAR);
6470 aSign = extractFloat128Sign( a );
6471 bSign = extractFloat128Sign( b );
6472 if ( aSign != bSign ) {
6475 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6479 aSign ? le128( b.high, b.low, a.high, a.low )
6480 : le128( a.high, a.low, b.high, b.low );
6484 /*----------------------------------------------------------------------------
6485 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6486 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6487 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6488 | Standard for Binary Floating-Point Arithmetic.
6489 *----------------------------------------------------------------------------*/
6491 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6495 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6496 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6497 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6498 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6500 if ( float128_is_signaling_nan( a )
6501 || float128_is_signaling_nan( b ) ) {
6502 float_raise( float_flag_invalid STATUS_VAR);
6506 aSign = extractFloat128Sign( a );
6507 bSign = extractFloat128Sign( b );
6508 if ( aSign != bSign ) {
6511 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6515 aSign ? lt128( b.high, b.low, a.high, a.low )
6516 : lt128( a.high, a.low, b.high, b.low );
6520 /*----------------------------------------------------------------------------
6521 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6522 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6523 | comparison is performed according to the IEC/IEEE Standard for Binary
6524 | Floating-Point Arithmetic.
6525 *----------------------------------------------------------------------------*/
6527 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6529 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6530 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6531 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6532 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6534 if ( float128_is_signaling_nan( a )
6535 || float128_is_signaling_nan( b ) ) {
6536 float_raise( float_flag_invalid STATUS_VAR);
6543 /* misc functions */
6544 float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6546 return int64_to_float32(a STATUS_VAR);
6549 float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6551 return int64_to_float64(a STATUS_VAR);
6554 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6558 int old_exc_flags = get_float_exception_flags(status);
6560 v = float32_to_int64(a STATUS_VAR);
6563 } else if (v > 0xffffffff) {
6568 set_float_exception_flags(old_exc_flags, status);
6569 float_raise(float_flag_invalid STATUS_VAR);
6573 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6577 int old_exc_flags = get_float_exception_flags(status);
6579 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6582 } else if (v > 0xffffffff) {
6587 set_float_exception_flags(old_exc_flags, status);
6588 float_raise(float_flag_invalid STATUS_VAR);
6592 int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6596 int old_exc_flags = get_float_exception_flags(status);
6598 v = float32_to_int32(a STATUS_VAR);
6601 } else if (v > 0x7fff) {
6607 set_float_exception_flags(old_exc_flags, status);
6608 float_raise(float_flag_invalid STATUS_VAR);
6612 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6616 int old_exc_flags = get_float_exception_flags(status);
6618 v = float32_to_int32(a STATUS_VAR);
6621 } else if (v > 0xffff) {
6627 set_float_exception_flags(old_exc_flags, status);
6628 float_raise(float_flag_invalid STATUS_VAR);
6632 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6636 int old_exc_flags = get_float_exception_flags(status);
6638 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6641 } else if (v > 0xffff) {
6646 set_float_exception_flags(old_exc_flags, status);
6647 float_raise(float_flag_invalid STATUS_VAR);
6651 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6655 int old_exc_flags = get_float_exception_flags(status);
6657 v = float64_to_uint64(a STATUS_VAR);
6658 if (v > 0xffffffff) {
6663 set_float_exception_flags(old_exc_flags, status);
6664 float_raise(float_flag_invalid STATUS_VAR);
6668 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6673 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6676 float_raise( float_flag_invalid STATUS_VAR);
6677 } else if (v > 0xffffffff) {
6679 float_raise( float_flag_invalid STATUS_VAR);
6686 int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6690 int old_exc_flags = get_float_exception_flags(status);
6692 v = float64_to_int32(a STATUS_VAR);
6695 } else if (v > 0x7fff) {
6701 set_float_exception_flags(old_exc_flags, status);
6702 float_raise(float_flag_invalid STATUS_VAR);
6706 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6710 int old_exc_flags = get_float_exception_flags(status);
6712 v = float64_to_int32(a STATUS_VAR);
6715 } else if (v > 0xffff) {
6721 set_float_exception_flags(old_exc_flags, status);
6722 float_raise(float_flag_invalid STATUS_VAR);
6726 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6730 int old_exc_flags = get_float_exception_flags(status);
6732 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6735 } else if (v > 0xffff) {
6740 set_float_exception_flags(old_exc_flags, status);
6741 float_raise(float_flag_invalid STATUS_VAR);
6745 /*----------------------------------------------------------------------------
6746 | Returns the result of converting the double-precision floating-point value
6747 | `a' to the 64-bit unsigned integer format. The conversion is
6748 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6749 | Arithmetic---which means in particular that the conversion is rounded
6750 | according to the current rounding mode. If `a' is a NaN, the largest
6751 | positive integer is returned. If the conversion overflows, the
6752 | largest unsigned integer is returned. If 'a' is negative, the value is
6753 | rounded and zero is returned; negative values that do not round to zero
6754 | will raise the inexact exception.
6755 *----------------------------------------------------------------------------*/
6757 uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6760 int_fast16_t aExp, shiftCount;
6761 uint64_t aSig, aSigExtra;
6762 a = float64_squash_input_denormal(a STATUS_VAR);
6764 aSig = extractFloat64Frac(a);
6765 aExp = extractFloat64Exp(a);
6766 aSign = extractFloat64Sign(a);
6767 if (aSign && (aExp > 1022)) {
6768 float_raise(float_flag_invalid STATUS_VAR);
6769 if (float64_is_any_nan(a)) {
6770 return LIT64(0xFFFFFFFFFFFFFFFF);
6776 aSig |= LIT64(0x0010000000000000);
6778 shiftCount = 0x433 - aExp;
6779 if (shiftCount <= 0) {
6781 float_raise(float_flag_invalid STATUS_VAR);
6782 return LIT64(0xFFFFFFFFFFFFFFFF);
6785 aSig <<= -shiftCount;
6787 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6789 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
6792 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6794 signed char current_rounding_mode = STATUS(float_rounding_mode);
6795 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
6796 int64_t v = float64_to_uint64(a STATUS_VAR);
6797 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
6801 #define COMPARE(s, nan_exp) \
6802 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6803 int is_quiet STATUS_PARAM ) \
6805 flag aSign, bSign; \
6806 uint ## s ## _t av, bv; \
6807 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6808 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6810 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6811 extractFloat ## s ## Frac( a ) ) || \
6812 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6813 extractFloat ## s ## Frac( b ) )) { \
6815 float ## s ## _is_signaling_nan( a ) || \
6816 float ## s ## _is_signaling_nan( b ) ) { \
6817 float_raise( float_flag_invalid STATUS_VAR); \
6819 return float_relation_unordered; \
6821 aSign = extractFloat ## s ## Sign( a ); \
6822 bSign = extractFloat ## s ## Sign( b ); \
6823 av = float ## s ## _val(a); \
6824 bv = float ## s ## _val(b); \
6825 if ( aSign != bSign ) { \
6826 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6828 return float_relation_equal; \
6830 return 1 - (2 * aSign); \
6834 return float_relation_equal; \
6836 return 1 - 2 * (aSign ^ ( av < bv )); \
6841 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6843 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6846 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6848 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6854 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6855 int is_quiet STATUS_PARAM )
6859 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6860 ( extractFloatx80Frac( a )<<1 ) ) ||
6861 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6862 ( extractFloatx80Frac( b )<<1 ) )) {
6864 floatx80_is_signaling_nan( a ) ||
6865 floatx80_is_signaling_nan( b ) ) {
6866 float_raise( float_flag_invalid STATUS_VAR);
6868 return float_relation_unordered;
6870 aSign = extractFloatx80Sign( a );
6871 bSign = extractFloatx80Sign( b );
6872 if ( aSign != bSign ) {
6874 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6875 ( ( a.low | b.low ) == 0 ) ) {
6877 return float_relation_equal;
6879 return 1 - (2 * aSign);
6882 if (a.low == b.low && a.high == b.high) {
6883 return float_relation_equal;
6885 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6890 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6892 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6895 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6897 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6900 INLINE int float128_compare_internal( float128 a, float128 b,
6901 int is_quiet STATUS_PARAM )
6905 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6906 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6907 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6908 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6910 float128_is_signaling_nan( a ) ||
6911 float128_is_signaling_nan( b ) ) {
6912 float_raise( float_flag_invalid STATUS_VAR);
6914 return float_relation_unordered;
6916 aSign = extractFloat128Sign( a );
6917 bSign = extractFloat128Sign( b );
6918 if ( aSign != bSign ) {
6919 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6921 return float_relation_equal;
6923 return 1 - (2 * aSign);
6926 if (a.low == b.low && a.high == b.high) {
6927 return float_relation_equal;
6929 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6934 int float128_compare( float128 a, float128 b STATUS_PARAM )
6936 return float128_compare_internal(a, b, 0 STATUS_VAR);
6939 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6941 return float128_compare_internal(a, b, 1 STATUS_VAR);
6944 /* min() and max() functions. These can't be implemented as
6945 * 'compare and pick one input' because that would mishandle
6946 * NaNs and +0 vs -0.
6948 * minnum() and maxnum() functions. These are similar to the min()
6949 * and max() functions but if one of the arguments is a QNaN and
6950 * the other is numerical then the numerical argument is returned.
6951 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6952 * and maxNum() operations. min() and max() are the typical min/max
6953 * semantics provided by many CPUs which predate that specification.
6956 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6957 int ismin, int isieee STATUS_PARAM) \
6959 flag aSign, bSign; \
6960 uint ## s ## _t av, bv; \
6961 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6962 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6963 if (float ## s ## _is_any_nan(a) || \
6964 float ## s ## _is_any_nan(b)) { \
6966 if (float ## s ## _is_quiet_nan(a) && \
6967 !float ## s ##_is_any_nan(b)) { \
6969 } else if (float ## s ## _is_quiet_nan(b) && \
6970 !float ## s ## _is_any_nan(a)) { \
6974 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6976 aSign = extractFloat ## s ## Sign(a); \
6977 bSign = extractFloat ## s ## Sign(b); \
6978 av = float ## s ## _val(a); \
6979 bv = float ## s ## _val(b); \
6980 if (aSign != bSign) { \
6982 return aSign ? a : b; \
6984 return aSign ? b : a; \
6988 return (aSign ^ (av < bv)) ? a : b; \
6990 return (aSign ^ (av < bv)) ? b : a; \
6995 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6997 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
7000 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7002 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
7005 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7007 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
7010 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7012 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
7019 /* Multiply A by 2 raised to the power N. */
7020 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7026 a = float32_squash_input_denormal(a STATUS_VAR);
7027 aSig = extractFloat32Frac( a );
7028 aExp = extractFloat32Exp( a );
7029 aSign = extractFloat32Sign( a );
7031 if ( aExp == 0xFF ) {
7033 return propagateFloat32NaN( a, a STATUS_VAR );
7039 } else if (aSig == 0) {
7047 } else if (n < -0x200) {
7053 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7056 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7062 a = float64_squash_input_denormal(a STATUS_VAR);
7063 aSig = extractFloat64Frac( a );
7064 aExp = extractFloat64Exp( a );
7065 aSign = extractFloat64Sign( a );
7067 if ( aExp == 0x7FF ) {
7069 return propagateFloat64NaN( a, a STATUS_VAR );
7074 aSig |= LIT64( 0x0010000000000000 );
7075 } else if (aSig == 0) {
7083 } else if (n < -0x1000) {
7089 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7092 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7098 aSig = extractFloatx80Frac( a );
7099 aExp = extractFloatx80Exp( a );
7100 aSign = extractFloatx80Sign( a );
7102 if ( aExp == 0x7FFF ) {
7104 return propagateFloatx80NaN( a, a STATUS_VAR );
7118 } else if (n < -0x10000) {
7123 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7124 aSign, aExp, aSig, 0 STATUS_VAR );
7127 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7131 uint64_t aSig0, aSig1;
7133 aSig1 = extractFloat128Frac1( a );
7134 aSig0 = extractFloat128Frac0( a );
7135 aExp = extractFloat128Exp( a );
7136 aSign = extractFloat128Sign( a );
7137 if ( aExp == 0x7FFF ) {
7138 if ( aSig0 | aSig1 ) {
7139 return propagateFloat128NaN( a, a STATUS_VAR );
7144 aSig0 |= LIT64( 0x0001000000000000 );
7145 } else if (aSig0 == 0 && aSig1 == 0) {
7153 } else if (n < -0x10000) {
7158 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1