4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a)
141 return float32_val(a) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a)
150 return (float32_val(a) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag extractFloat32Sign(float32 a)
159 return float32_val(a) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a)
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a)
177 return (float64_val(a) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag extractFloat64Sign(float64 a)
186 return float64_val(a) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__)) {
195 float_class_unclassified,
199 float_class_qnan, /* all NaNs from here */
202 float_class_msnan, /* maybe silenced */
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
247 uint64_t roundeven_mask;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params = {
266 static const FloatFmt float32_params = {
270 static const FloatFmt float64_params = {
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
287 static inline FloatParts float16_unpack_raw(float16 f)
289 return unpack_raw(float16_params, f);
292 static inline FloatParts float32_unpack_raw(float32 f)
294 return unpack_raw(float32_params, f);
297 static inline FloatParts float64_unpack_raw(float64 f)
299 return unpack_raw(float64_params, f);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
310 static inline float16 float16_pack_raw(FloatParts p)
312 return make_float16(pack_raw(float16_params, p));
315 static inline float32 float32_pack_raw(FloatParts p)
317 return make_float32(pack_raw(float32_params, p));
320 static inline float64 float64_pack_raw(FloatParts p)
322 return make_float64(pack_raw(float64_params, p));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
333 #ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
340 part.cls = float_class_qnan;
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
393 case float_round_ties_away:
394 overflow_norm = false;
397 case float_round_to_zero:
398 overflow_norm = true;
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
410 g_assert_not_reached();
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
431 p.cls = float_class_inf;
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
452 flags |= float_flag_inexact;
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
468 case float_class_zero:
474 case float_class_inf:
480 case float_class_qnan:
481 case float_class_snan:
486 g_assert_not_reached();
489 float_raise(flags, s);
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
552 return unlikely(c >= float_class_qnan);
554 static bool is_snan(FloatClass c)
556 return c == float_class_snan;
558 static bool is_qnan(FloatClass c)
560 return c == float_class_qnan;
563 static FloatParts return_nan(FloatParts a, float_status *s)
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
577 g_assert_not_reached();
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
594 (a.frac == b.frac && a.sign < b.sign))) {
597 a.cls = float_class_msnan;
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
625 a.cls = float_class_dnan;
628 g_assert_not_reached();
631 a.cls = float_class_msnan;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
649 if (a_sign != b_sign) {
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
692 if (b.cls == float_class_zero) {
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
738 return float16_round_pack_canonical(pr, status);
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
748 return float32_round_pack_canonical(pr, status);
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
758 return float64_round_pack_canonical(pr, status);
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
768 return float16_round_pack_canonical(pr, status);
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
778 return float32_round_pack_canonical(pr, status);
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
788 return float64_round_pack_canonical(pr, status);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
799 bool sign = a.sign ^ b.sign;
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
803 int exp = a.exp + b.exp;
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
839 g_assert_not_reached();
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
849 return float16_round_pack_canonical(pr, status);
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
859 return float32_round_pack_canonical(pr, status);
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
869 return float64_round_pack_canonical(pr, status);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
890 bool sign_flip = flags & float_muladd_negate_result;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
910 if (flags & float_muladd_negate_c) {
914 p_sign = a.sign ^ b.sign;
916 if (flags & float_muladd_negate_product) {
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
925 p_class = float_class_normal;
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
951 } else if (flags & float_muladd_halve_result) {
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
962 p_exp = a.exp + b.exp;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
992 /* shift c to the same binary point as the product (124) */
995 shift128RightJamming(c_hi, c_lo,
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1027 shift128RightJamming(c_hi, c_lo,
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1033 if (hi == 0 && lo == 0) {
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1043 shift = clz64(lo) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1053 lo = lo << (shift - 64);
1055 hi = (hi << shift) | (lo >> (64 - shift));
1056 lo = hi | ((lo << shift) != 0);
1063 if (flags & float_muladd_halve_result) {
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1084 return float16_round_pack_canonical(pr, status);
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1095 return float32_round_pack_canonical(pr, status);
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1106 return float64_round_pack_canonical(pr, status);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1117 bool sign = a.sign ^ b.sign;
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1141 /* 0/0 or Inf/Inf */
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1150 if (b.cls == float_class_zero) {
1151 s->float_exception_flags |= float_flag_divbyzero;
1152 a.cls = float_class_inf;
1156 /* Inf / x or 0 / x */
1157 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1167 g_assert_not_reached();
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1176 return float16_round_pack_canonical(pr, status);
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1185 return float32_round_pack_canonical(pr, status);
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1194 return float64_round_pack_canonical(pr, status);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1232 case float_round_to_zero:
1235 case float_round_up:
1238 case float_round_down:
1242 g_assert_not_reached();
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1249 a.cls = float_class_zero;
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1262 case float_round_ties_away:
1265 case float_round_to_zero:
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1275 g_assert_not_reached();
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1290 g_assert_not_reached();
1295 float16 float16_round_to_int(float16 a, float_status *s)
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1302 float32 float32_round_to_int(float32 a, float_status *s)
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1309 float64 float64_round_to_int(float64 a, float_status *s)
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335 int64_t min, int64_t max,
1339 int orig_flags = get_float_exception_flags(s);
1340 FloatParts p = round_to_int(in, rmode, s);
1343 case float_class_snan:
1344 case float_class_qnan:
1346 case float_class_inf:
1347 return p.sign ? min : max;
1348 case float_class_zero:
1350 case float_class_normal:
1351 if (p.exp < DECOMPOSED_BINARY_POINT) {
1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1359 if (r < -(uint64_t) min) {
1362 s->float_exception_flags = orig_flags | float_flag_invalid;
1369 s->float_exception_flags = orig_flags | float_flag_invalid;
1374 g_assert_not_reached();
1378 #define FLOAT_TO_INT(fsz, isz) \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1383 return round_to_int_and_pack(p, s->float_rounding_mode, \
1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1389 (float ## fsz a, float_status *s) \
1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1392 return round_to_int_and_pack(p, float_round_to_zero, \
1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1412 * Returns the result of converting the floating-point value `a' to
1413 * the unsigned integer format. The conversion is performed according
1414 * to the IEC/IEEE Standard for Binary Floating-Point
1415 * Arithmetic---which means in particular that the conversion is
1416 * rounded according to the current rounding mode. If `a' is a NaN,
1417 * the largest unsigned integer is returned. Otherwise, if the
1418 * conversion overflows, the largest unsigned integer is returned. If
1419 * the 'a' is negative, the result is rounded and zero is returned;
1420 * values that do not round to zero will raise the inexact exception
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1427 int orig_flags = get_float_exception_flags(s);
1428 FloatParts p = round_to_int(in, rmode, s);
1431 case float_class_snan:
1432 case float_class_qnan:
1433 s->float_exception_flags = orig_flags | float_flag_invalid;
1435 case float_class_inf:
1436 return p.sign ? 0 : max;
1437 case float_class_zero:
1439 case float_class_normal:
1443 s->float_exception_flags = orig_flags | float_flag_invalid;
1447 if (p.exp < DECOMPOSED_BINARY_POINT) {
1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1452 s->float_exception_flags = orig_flags | float_flag_invalid;
1456 /* For uint64 this will never trip, but if p.exp is too large
1457 * to shift a decomposed fraction we shall have exited via the
1461 s->float_exception_flags = orig_flags | float_flag_invalid;
1468 g_assert_not_reached();
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1478 UINT ## isz ## _MAX, s); \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1482 (float ## fsz a, float_status *s) \
1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1486 UINT ## isz ## _MAX, s); \
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1501 #undef FLOAT_TO_UINT
1504 * Integer to float conversions
1506 * Returns the result of converting the two's complement integer `a'
1507 * to the floating-point format. The conversion is performed according
1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1515 r.cls = float_class_zero;
1517 } else if (a == (1ULL << 63)) {
1518 r.cls = float_class_normal;
1520 r.frac = DECOMPOSED_IMPLICIT_BIT;
1531 int shift = clz64(f) - 1;
1532 r.cls = float_class_normal;
1533 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534 r.frac = f << shift;
1540 float16 int64_to_float16(int64_t a, float_status *status)
1542 FloatParts pa = int_to_float(a, status);
1543 return float16_round_pack_canonical(pa, status);
1546 float16 int32_to_float16(int32_t a, float_status *status)
1548 return int64_to_float16(a, status);
1551 float16 int16_to_float16(int16_t a, float_status *status)
1553 return int64_to_float16(a, status);
1556 float32 int64_to_float32(int64_t a, float_status *status)
1558 FloatParts pa = int_to_float(a, status);
1559 return float32_round_pack_canonical(pa, status);
1562 float32 int32_to_float32(int32_t a, float_status *status)
1564 return int64_to_float32(a, status);
1567 float32 int16_to_float32(int16_t a, float_status *status)
1569 return int64_to_float32(a, status);
1572 float64 int64_to_float64(int64_t a, float_status *status)
1574 FloatParts pa = int_to_float(a, status);
1575 return float64_round_pack_canonical(pa, status);
1578 float64 int32_to_float64(int32_t a, float_status *status)
1580 return int64_to_float64(a, status);
1583 float64 int16_to_float64(int16_t a, float_status *status)
1585 return int64_to_float64(a, status);
1590 * Unsigned Integer to float conversions
1592 * Returns the result of converting the unsigned integer `a' to the
1593 * floating-point format. The conversion is performed according to the
1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1599 FloatParts r = { .sign = false};
1602 r.cls = float_class_zero;
1604 int spare_bits = clz64(a) - 1;
1605 r.cls = float_class_normal;
1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607 if (spare_bits < 0) {
1608 shift64RightJamming(a, -spare_bits, &a);
1611 r.frac = a << spare_bits;
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1620 FloatParts pa = uint_to_float(a, status);
1621 return float16_round_pack_canonical(pa, status);
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1626 return uint64_to_float16(a, status);
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1631 return uint64_to_float16(a, status);
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1636 FloatParts pa = uint_to_float(a, status);
1637 return float32_round_pack_canonical(pa, status);
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1642 return uint64_to_float32(a, status);
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1647 return uint64_to_float32(a, status);
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1652 FloatParts pa = uint_to_float(a, status);
1653 return float64_round_pack_canonical(pa, status);
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1658 return uint64_to_float64(a, status);
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1663 return uint64_to_float64(a, status);
1667 /* min() and max() functions. These can't be implemented as
1668 * 'compare and pick one input' because that would mishandle
1669 * NaNs and +0 vs -0.
1671 * minnum() and maxnum() functions. These are similar to the min()
1672 * and max() functions but if one of the arguments is a QNaN and
1673 * the other is numerical then the numerical argument is returned.
1674 * SNaNs will get quietened before being returned.
1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676 * and maxNum() operations. min() and max() are the typical min/max
1677 * semantics provided by many CPUs which predate that specification.
1679 * minnummag() and maxnummag() functions correspond to minNumMag()
1680 * and minNumMag() from the IEEE-754 2008.
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683 bool ieee, bool ismag, float_status *s)
1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1687 /* Takes two floating-point values `a' and `b', one of
1688 * which is a NaN, and returns the appropriate NaN
1689 * result. If either `a' or `b' is a signaling NaN,
1690 * the invalid exception is raised.
1692 if (is_snan(a.cls) || is_snan(b.cls)) {
1693 return pick_nan(a, b, s);
1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1700 return pick_nan(a, b, s);
1703 bool a_sign, b_sign;
1706 case float_class_normal:
1709 case float_class_inf:
1712 case float_class_zero:
1716 g_assert_not_reached();
1720 case float_class_normal:
1723 case float_class_inf:
1726 case float_class_zero:
1730 g_assert_not_reached();
1737 a_sign = b_sign = 0;
1740 if (a_sign == b_sign) {
1741 bool a_less = a_exp < b_exp;
1742 if (a_exp == b_exp) {
1743 a_less = a.frac < b.frac;
1745 return a_sign ^ a_less ^ ismin ? b : a;
1747 return a_sign ^ ismin ? b : a;
1752 #define MINMAX(sz, name, ismin, isiee, ismag) \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1760 return float ## sz ## _round_pack_canonical(pr, s); \
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1786 /* Floating point compare */
1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1790 if (is_nan(a.cls) || is_nan(b.cls)) {
1792 a.cls == float_class_snan ||
1793 b.cls == float_class_snan) {
1794 s->float_exception_flags |= float_flag_invalid;
1796 return float_relation_unordered;
1799 if (a.cls == float_class_zero) {
1800 if (b.cls == float_class_zero) {
1801 return float_relation_equal;
1803 return b.sign ? float_relation_greater : float_relation_less;
1804 } else if (b.cls == float_class_zero) {
1805 return a.sign ? float_relation_less : float_relation_greater;
1808 /* The only really important thing about infinity is its sign. If
1809 * both are infinities the sign marks the smallest of the two.
1811 if (a.cls == float_class_inf) {
1812 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1813 return float_relation_equal;
1815 return a.sign ? float_relation_less : float_relation_greater;
1816 } else if (b.cls == float_class_inf) {
1817 return b.sign ? float_relation_greater : float_relation_less;
1820 if (a.sign != b.sign) {
1821 return a.sign ? float_relation_less : float_relation_greater;
1824 if (a.exp == b.exp) {
1825 if (a.frac == b.frac) {
1826 return float_relation_equal;
1829 return a.frac > b.frac ?
1830 float_relation_less : float_relation_greater;
1832 return a.frac > b.frac ?
1833 float_relation_greater : float_relation_less;
1837 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1839 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1844 #define COMPARE(sz) \
1845 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1848 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1849 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1850 return compare_floats(pa, pb, false, s); \
1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1857 return compare_floats(pa, pb, true, s); \
1866 /* Multiply A by 2 raised to the power N. */
1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1869 if (unlikely(is_nan(a.cls))) {
1870 return return_nan(a, s);
1872 if (a.cls == float_class_normal) {
1878 float16 float16_scalbn(float16 a, int n, float_status *status)
1880 FloatParts pa = float16_unpack_canonical(a, status);
1881 FloatParts pr = scalbn_decomposed(pa, n, status);
1882 return float16_round_pack_canonical(pr, status);
1885 float32 float32_scalbn(float32 a, int n, float_status *status)
1887 FloatParts pa = float32_unpack_canonical(a, status);
1888 FloatParts pr = scalbn_decomposed(pa, n, status);
1889 return float32_round_pack_canonical(pr, status);
1892 float64 float64_scalbn(float64 a, int n, float_status *status)
1894 FloatParts pa = float64_unpack_canonical(a, status);
1895 FloatParts pr = scalbn_decomposed(pa, n, status);
1896 return float64_round_pack_canonical(pr, status);
1902 * The old softfloat code did an approximation step before zeroing in
1903 * on the final result. However for simpleness we just compute the
1904 * square root by iterating down from the implicit bit to enough extra
1905 * bits to ensure we get a correctly rounded result.
1907 * This does mean however the calculation is slower than before,
1908 * especially for 64 bit floats.
1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1913 uint64_t a_frac, r_frac, s_frac;
1916 if (is_nan(a.cls)) {
1917 return return_nan(a, s);
1919 if (a.cls == float_class_zero) {
1920 return a; /* sqrt(+-0) = +-0 */
1923 s->float_exception_flags |= float_flag_invalid;
1924 a.cls = float_class_dnan;
1927 if (a.cls == float_class_inf) {
1928 return a; /* sqrt(+inf) = +inf */
1931 assert(a.cls == float_class_normal);
1933 /* We need two overflow bits at the top. Adding room for that is a
1934 * right shift. If the exponent is odd, we can discard the low bit
1935 * by multiplying the fraction by 2; that's a left shift. Combine
1936 * those and we shift right if the exponent is even.
1944 /* Bit-by-bit computation of sqrt. */
1948 /* Iterate from implicit bit down to the 3 extra bits to compute a
1949 * properly rounded result. Remember we've inserted one more bit
1950 * at the top, so these positions are one less.
1952 bit = DECOMPOSED_BINARY_POINT - 1;
1953 last_bit = MAX(p->frac_shift - 4, 0);
1955 uint64_t q = 1ULL << bit;
1956 uint64_t t_frac = s_frac + q;
1957 if (t_frac <= a_frac) {
1958 s_frac = t_frac + q;
1963 } while (--bit >= last_bit);
1965 /* Undo the right shift done above. If there is any remaining
1966 * fraction, the result is inexact. Set the sticky bit.
1968 a.frac = (r_frac << 1) + (a_frac != 0);
1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1975 FloatParts pa = float16_unpack_canonical(a, status);
1976 FloatParts pr = sqrt_float(pa, status, &float16_params);
1977 return float16_round_pack_canonical(pr, status);
1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1982 FloatParts pa = float32_unpack_canonical(a, status);
1983 FloatParts pr = sqrt_float(pa, status, &float32_params);
1984 return float32_round_pack_canonical(pr, status);
1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1989 FloatParts pa = float64_unpack_canonical(a, status);
1990 FloatParts pr = sqrt_float(pa, status, &float64_params);
1991 return float64_round_pack_canonical(pr, status);
1995 /*----------------------------------------------------------------------------
1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1998 | input. If `zSign' is 1, the input is negated before being converted to an
1999 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2000 | is simply rounded to an integer, with the inexact exception raised if the
2001 | input cannot be represented exactly as an integer. However, if the fixed-
2002 | point input is too large, the invalid exception is raised and the largest
2003 | positive or negative integer is returned.
2004 *----------------------------------------------------------------------------*/
2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2008 int8_t roundingMode;
2009 flag roundNearestEven;
2010 int8_t roundIncrement, roundBits;
2013 roundingMode = status->float_rounding_mode;
2014 roundNearestEven = ( roundingMode == float_round_nearest_even );
2015 switch (roundingMode) {
2016 case float_round_nearest_even:
2017 case float_round_ties_away:
2018 roundIncrement = 0x40;
2020 case float_round_to_zero:
2023 case float_round_up:
2024 roundIncrement = zSign ? 0 : 0x7f;
2026 case float_round_down:
2027 roundIncrement = zSign ? 0x7f : 0;
2032 roundBits = absZ & 0x7F;
2033 absZ = ( absZ + roundIncrement )>>7;
2034 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2036 if ( zSign ) z = - z;
2037 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2038 float_raise(float_flag_invalid, status);
2039 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2042 status->float_exception_flags |= float_flag_inexact;
2048 /*----------------------------------------------------------------------------
2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2050 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2051 | and returns the properly rounded 64-bit integer corresponding to the input.
2052 | If `zSign' is 1, the input is negated before being converted to an integer.
2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2054 | the inexact exception raised if the input cannot be represented exactly as
2055 | an integer. However, if the fixed-point input is too large, the invalid
2056 | exception is raised and the largest positive or negative integer is
2058 *----------------------------------------------------------------------------*/
2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2061 float_status *status)
2063 int8_t roundingMode;
2064 flag roundNearestEven, increment;
2067 roundingMode = status->float_rounding_mode;
2068 roundNearestEven = ( roundingMode == float_round_nearest_even );
2069 switch (roundingMode) {
2070 case float_round_nearest_even:
2071 case float_round_ties_away:
2072 increment = ((int64_t) absZ1 < 0);
2074 case float_round_to_zero:
2077 case float_round_up:
2078 increment = !zSign && absZ1;
2080 case float_round_down:
2081 increment = zSign && absZ1;
2088 if ( absZ0 == 0 ) goto overflow;
2089 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2092 if ( zSign ) z = - z;
2093 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2095 float_raise(float_flag_invalid, status);
2097 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2098 : LIT64( 0x7FFFFFFFFFFFFFFF );
2101 status->float_exception_flags |= float_flag_inexact;
2107 /*----------------------------------------------------------------------------
2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2109 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2111 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2112 | with the inexact exception raised if the input cannot be represented exactly
2113 | as an integer. However, if the fixed-point input is too large, the invalid
2114 | exception is raised and the largest unsigned integer is returned.
2115 *----------------------------------------------------------------------------*/
2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2118 uint64_t absZ1, float_status *status)
2120 int8_t roundingMode;
2121 flag roundNearestEven, increment;
2123 roundingMode = status->float_rounding_mode;
2124 roundNearestEven = (roundingMode == float_round_nearest_even);
2125 switch (roundingMode) {
2126 case float_round_nearest_even:
2127 case float_round_ties_away:
2128 increment = ((int64_t)absZ1 < 0);
2130 case float_round_to_zero:
2133 case float_round_up:
2134 increment = !zSign && absZ1;
2136 case float_round_down:
2137 increment = zSign && absZ1;
2145 float_raise(float_flag_invalid, status);
2146 return LIT64(0xFFFFFFFFFFFFFFFF);
2148 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2151 if (zSign && absZ0) {
2152 float_raise(float_flag_invalid, status);
2157 status->float_exception_flags |= float_flag_inexact;
2162 /*----------------------------------------------------------------------------
2163 | If `a' is denormal and we are in flush-to-zero mode then set the
2164 | input-denormal exception and return zero. Otherwise just return the value.
2165 *----------------------------------------------------------------------------*/
2166 float32 float32_squash_input_denormal(float32 a, float_status *status)
2168 if (status->flush_inputs_to_zero) {
2169 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2170 float_raise(float_flag_input_denormal, status);
2171 return make_float32(float32_val(a) & 0x80000000);
2177 /*----------------------------------------------------------------------------
2178 | Normalizes the subnormal single-precision floating-point value represented
2179 | by the denormalized significand `aSig'. The normalized exponent and
2180 | significand are stored at the locations pointed to by `zExpPtr' and
2181 | `zSigPtr', respectively.
2182 *----------------------------------------------------------------------------*/
2185 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2189 shiftCount = countLeadingZeros32( aSig ) - 8;
2190 *zSigPtr = aSig<<shiftCount;
2191 *zExpPtr = 1 - shiftCount;
2195 /*----------------------------------------------------------------------------
2196 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2197 | single-precision floating-point value, returning the result. After being
2198 | shifted into the proper positions, the three fields are simply added
2199 | together to form the result. This means that any integer portion of `zSig'
2200 | will be added into the exponent. Since a properly normalized significand
2201 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2202 | than the desired result exponent whenever `zSig' is a complete, normalized
2204 *----------------------------------------------------------------------------*/
2206 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
2209 return make_float32(
2210 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
2214 /*----------------------------------------------------------------------------
2215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2216 | and significand `zSig', and returns the proper single-precision floating-
2217 | point value corresponding to the abstract input. Ordinarily, the abstract
2218 | value is simply rounded and packed into the single-precision format, with
2219 | the inexact exception raised if the abstract input cannot be represented
2220 | exactly. However, if the abstract value is too large, the overflow and
2221 | inexact exceptions are raised and an infinity or maximal finite value is
2222 | returned. If the abstract value is too small, the input value is rounded to
2223 | a subnormal number, and the underflow and inexact exceptions are raised if
2224 | the abstract input cannot be represented exactly as a subnormal single-
2225 | precision floating-point number.
2226 | The input significand `zSig' has its binary point between bits 30
2227 | and 29, which is 7 bits to the left of the usual location. This shifted
2228 | significand must be normalized or smaller. If `zSig' is not normalized,
2229 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2230 | and it must not require rounding. In the usual case that `zSig' is
2231 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2232 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2233 | Binary Floating-Point Arithmetic.
2234 *----------------------------------------------------------------------------*/
2236 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2237 float_status *status)
2239 int8_t roundingMode;
2240 flag roundNearestEven;
2241 int8_t roundIncrement, roundBits;
2244 roundingMode = status->float_rounding_mode;
2245 roundNearestEven = ( roundingMode == float_round_nearest_even );
2246 switch (roundingMode) {
2247 case float_round_nearest_even:
2248 case float_round_ties_away:
2249 roundIncrement = 0x40;
2251 case float_round_to_zero:
2254 case float_round_up:
2255 roundIncrement = zSign ? 0 : 0x7f;
2257 case float_round_down:
2258 roundIncrement = zSign ? 0x7f : 0;
2264 roundBits = zSig & 0x7F;
2265 if ( 0xFD <= (uint16_t) zExp ) {
2266 if ( ( 0xFD < zExp )
2267 || ( ( zExp == 0xFD )
2268 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2270 float_raise(float_flag_overflow | float_flag_inexact, status);
2271 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2274 if (status->flush_to_zero) {
2275 float_raise(float_flag_output_denormal, status);
2276 return packFloat32(zSign, 0, 0);
2279 (status->float_detect_tininess
2280 == float_tininess_before_rounding)
2282 || ( zSig + roundIncrement < 0x80000000 );
2283 shift32RightJamming( zSig, - zExp, &zSig );
2285 roundBits = zSig & 0x7F;
2286 if (isTiny && roundBits) {
2287 float_raise(float_flag_underflow, status);
2292 status->float_exception_flags |= float_flag_inexact;
2294 zSig = ( zSig + roundIncrement )>>7;
2295 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2296 if ( zSig == 0 ) zExp = 0;
2297 return packFloat32( zSign, zExp, zSig );
2301 /*----------------------------------------------------------------------------
2302 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2303 | and significand `zSig', and returns the proper single-precision floating-
2304 | point value corresponding to the abstract input. This routine is just like
2305 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2306 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2307 | floating-point exponent.
2308 *----------------------------------------------------------------------------*/
2311 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2312 float_status *status)
2316 shiftCount = countLeadingZeros32( zSig ) - 1;
2317 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2322 /*----------------------------------------------------------------------------
2323 | If `a' is denormal and we are in flush-to-zero mode then set the
2324 | input-denormal exception and return zero. Otherwise just return the value.
2325 *----------------------------------------------------------------------------*/
2326 float64 float64_squash_input_denormal(float64 a, float_status *status)
2328 if (status->flush_inputs_to_zero) {
2329 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2330 float_raise(float_flag_input_denormal, status);
2331 return make_float64(float64_val(a) & (1ULL << 63));
2337 /*----------------------------------------------------------------------------
2338 | Normalizes the subnormal double-precision floating-point value represented
2339 | by the denormalized significand `aSig'. The normalized exponent and
2340 | significand are stored at the locations pointed to by `zExpPtr' and
2341 | `zSigPtr', respectively.
2342 *----------------------------------------------------------------------------*/
2345 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2349 shiftCount = countLeadingZeros64( aSig ) - 11;
2350 *zSigPtr = aSig<<shiftCount;
2351 *zExpPtr = 1 - shiftCount;
2355 /*----------------------------------------------------------------------------
2356 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2357 | double-precision floating-point value, returning the result. After being
2358 | shifted into the proper positions, the three fields are simply added
2359 | together to form the result. This means that any integer portion of `zSig'
2360 | will be added into the exponent. Since a properly normalized significand
2361 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2362 | than the desired result exponent whenever `zSig' is a complete, normalized
2364 *----------------------------------------------------------------------------*/
2366 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2369 return make_float64(
2370 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2374 /*----------------------------------------------------------------------------
2375 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2376 | and significand `zSig', and returns the proper double-precision floating-
2377 | point value corresponding to the abstract input. Ordinarily, the abstract
2378 | value is simply rounded and packed into the double-precision format, with
2379 | the inexact exception raised if the abstract input cannot be represented
2380 | exactly. However, if the abstract value is too large, the overflow and
2381 | inexact exceptions are raised and an infinity or maximal finite value is
2382 | returned. If the abstract value is too small, the input value is rounded to
2383 | a subnormal number, and the underflow and inexact exceptions are raised if
2384 | the abstract input cannot be represented exactly as a subnormal double-
2385 | precision floating-point number.
2386 | The input significand `zSig' has its binary point between bits 62
2387 | and 61, which is 10 bits to the left of the usual location. This shifted
2388 | significand must be normalized or smaller. If `zSig' is not normalized,
2389 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2390 | and it must not require rounding. In the usual case that `zSig' is
2391 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2392 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2393 | Binary Floating-Point Arithmetic.
2394 *----------------------------------------------------------------------------*/
2396 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2397 float_status *status)
2399 int8_t roundingMode;
2400 flag roundNearestEven;
2401 int roundIncrement, roundBits;
2404 roundingMode = status->float_rounding_mode;
2405 roundNearestEven = ( roundingMode == float_round_nearest_even );
2406 switch (roundingMode) {
2407 case float_round_nearest_even:
2408 case float_round_ties_away:
2409 roundIncrement = 0x200;
2411 case float_round_to_zero:
2414 case float_round_up:
2415 roundIncrement = zSign ? 0 : 0x3ff;
2417 case float_round_down:
2418 roundIncrement = zSign ? 0x3ff : 0;
2420 case float_round_to_odd:
2421 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2426 roundBits = zSig & 0x3FF;
2427 if ( 0x7FD <= (uint16_t) zExp ) {
2428 if ( ( 0x7FD < zExp )
2429 || ( ( zExp == 0x7FD )
2430 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2432 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2433 roundIncrement != 0;
2434 float_raise(float_flag_overflow | float_flag_inexact, status);
2435 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2438 if (status->flush_to_zero) {
2439 float_raise(float_flag_output_denormal, status);
2440 return packFloat64(zSign, 0, 0);
2443 (status->float_detect_tininess
2444 == float_tininess_before_rounding)
2446 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2447 shift64RightJamming( zSig, - zExp, &zSig );
2449 roundBits = zSig & 0x3FF;
2450 if (isTiny && roundBits) {
2451 float_raise(float_flag_underflow, status);
2453 if (roundingMode == float_round_to_odd) {
2455 * For round-to-odd case, the roundIncrement depends on
2456 * zSig which just changed.
2458 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2463 status->float_exception_flags |= float_flag_inexact;
2465 zSig = ( zSig + roundIncrement )>>10;
2466 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2467 if ( zSig == 0 ) zExp = 0;
2468 return packFloat64( zSign, zExp, zSig );
2472 /*----------------------------------------------------------------------------
2473 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2474 | and significand `zSig', and returns the proper double-precision floating-
2475 | point value corresponding to the abstract input. This routine is just like
2476 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2477 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2478 | floating-point exponent.
2479 *----------------------------------------------------------------------------*/
2482 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2483 float_status *status)
2487 shiftCount = countLeadingZeros64( zSig ) - 1;
2488 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2493 /*----------------------------------------------------------------------------
2494 | Returns the fraction bits of the extended double-precision floating-point
2496 *----------------------------------------------------------------------------*/
2498 static inline uint64_t extractFloatx80Frac( floatx80 a )
2505 /*----------------------------------------------------------------------------
2506 | Returns the exponent bits of the extended double-precision floating-point
2508 *----------------------------------------------------------------------------*/
2510 static inline int32_t extractFloatx80Exp( floatx80 a )
2513 return a.high & 0x7FFF;
2517 /*----------------------------------------------------------------------------
2518 | Returns the sign bit of the extended double-precision floating-point value
2520 *----------------------------------------------------------------------------*/
2522 static inline flag extractFloatx80Sign( floatx80 a )
2529 /*----------------------------------------------------------------------------
2530 | Normalizes the subnormal extended double-precision floating-point value
2531 | represented by the denormalized significand `aSig'. The normalized exponent
2532 | and significand are stored at the locations pointed to by `zExpPtr' and
2533 | `zSigPtr', respectively.
2534 *----------------------------------------------------------------------------*/
2537 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
2541 shiftCount = countLeadingZeros64( aSig );
2542 *zSigPtr = aSig<<shiftCount;
2543 *zExpPtr = 1 - shiftCount;
2547 /*----------------------------------------------------------------------------
2548 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2549 | extended double-precision floating-point value, returning the result.
2550 *----------------------------------------------------------------------------*/
2552 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
2557 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
2562 /*----------------------------------------------------------------------------
2563 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2564 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2565 | and returns the proper extended double-precision floating-point value
2566 | corresponding to the abstract input. Ordinarily, the abstract value is
2567 | rounded and packed into the extended double-precision format, with the
2568 | inexact exception raised if the abstract input cannot be represented
2569 | exactly. However, if the abstract value is too large, the overflow and
2570 | inexact exceptions are raised and an infinity or maximal finite value is
2571 | returned. If the abstract value is too small, the input value is rounded to
2572 | a subnormal number, and the underflow and inexact exceptions are raised if
2573 | the abstract input cannot be represented exactly as a subnormal extended
2574 | double-precision floating-point number.
2575 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2576 | number of bits as single or double precision, respectively. Otherwise, the
2577 | result is rounded to the full precision of the extended double-precision
2579 | The input significand must be normalized or smaller. If the input
2580 | significand is not normalized, `zExp' must be 0; in that case, the result
2581 | returned is a subnormal number, and it must not require rounding. The
2582 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2583 | Floating-Point Arithmetic.
2584 *----------------------------------------------------------------------------*/
2586 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2587 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2588 float_status *status)
2590 int8_t roundingMode;
2591 flag roundNearestEven, increment, isTiny;
2592 int64_t roundIncrement, roundMask, roundBits;
2594 roundingMode = status->float_rounding_mode;
2595 roundNearestEven = ( roundingMode == float_round_nearest_even );
2596 if ( roundingPrecision == 80 ) goto precision80;
2597 if ( roundingPrecision == 64 ) {
2598 roundIncrement = LIT64( 0x0000000000000400 );
2599 roundMask = LIT64( 0x00000000000007FF );
2601 else if ( roundingPrecision == 32 ) {
2602 roundIncrement = LIT64( 0x0000008000000000 );
2603 roundMask = LIT64( 0x000000FFFFFFFFFF );
2608 zSig0 |= ( zSig1 != 0 );
2609 switch (roundingMode) {
2610 case float_round_nearest_even:
2611 case float_round_ties_away:
2613 case float_round_to_zero:
2616 case float_round_up:
2617 roundIncrement = zSign ? 0 : roundMask;
2619 case float_round_down:
2620 roundIncrement = zSign ? roundMask : 0;
2625 roundBits = zSig0 & roundMask;
2626 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2627 if ( ( 0x7FFE < zExp )
2628 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2633 if (status->flush_to_zero) {
2634 float_raise(float_flag_output_denormal, status);
2635 return packFloatx80(zSign, 0, 0);
2638 (status->float_detect_tininess
2639 == float_tininess_before_rounding)
2641 || ( zSig0 <= zSig0 + roundIncrement );
2642 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2644 roundBits = zSig0 & roundMask;
2645 if (isTiny && roundBits) {
2646 float_raise(float_flag_underflow, status);
2649 status->float_exception_flags |= float_flag_inexact;
2651 zSig0 += roundIncrement;
2652 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2653 roundIncrement = roundMask + 1;
2654 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2655 roundMask |= roundIncrement;
2657 zSig0 &= ~ roundMask;
2658 return packFloatx80( zSign, zExp, zSig0 );
2662 status->float_exception_flags |= float_flag_inexact;
2664 zSig0 += roundIncrement;
2665 if ( zSig0 < roundIncrement ) {
2667 zSig0 = LIT64( 0x8000000000000000 );
2669 roundIncrement = roundMask + 1;
2670 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2671 roundMask |= roundIncrement;
2673 zSig0 &= ~ roundMask;
2674 if ( zSig0 == 0 ) zExp = 0;
2675 return packFloatx80( zSign, zExp, zSig0 );
2677 switch (roundingMode) {
2678 case float_round_nearest_even:
2679 case float_round_ties_away:
2680 increment = ((int64_t)zSig1 < 0);
2682 case float_round_to_zero:
2685 case float_round_up:
2686 increment = !zSign && zSig1;
2688 case float_round_down:
2689 increment = zSign && zSig1;
2694 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2695 if ( ( 0x7FFE < zExp )
2696 || ( ( zExp == 0x7FFE )
2697 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2703 float_raise(float_flag_overflow | float_flag_inexact, status);
2704 if ( ( roundingMode == float_round_to_zero )
2705 || ( zSign && ( roundingMode == float_round_up ) )
2706 || ( ! zSign && ( roundingMode == float_round_down ) )
2708 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2710 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2714 (status->float_detect_tininess
2715 == float_tininess_before_rounding)
2718 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2719 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2721 if (isTiny && zSig1) {
2722 float_raise(float_flag_underflow, status);
2725 status->float_exception_flags |= float_flag_inexact;
2727 switch (roundingMode) {
2728 case float_round_nearest_even:
2729 case float_round_ties_away:
2730 increment = ((int64_t)zSig1 < 0);
2732 case float_round_to_zero:
2735 case float_round_up:
2736 increment = !zSign && zSig1;
2738 case float_round_down:
2739 increment = zSign && zSig1;
2747 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2748 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2750 return packFloatx80( zSign, zExp, zSig0 );
2754 status->float_exception_flags |= float_flag_inexact;
2760 zSig0 = LIT64( 0x8000000000000000 );
2763 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2767 if ( zSig0 == 0 ) zExp = 0;
2769 return packFloatx80( zSign, zExp, zSig0 );
2773 /*----------------------------------------------------------------------------
2774 | Takes an abstract floating-point value having sign `zSign', exponent
2775 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2776 | and returns the proper extended double-precision floating-point value
2777 | corresponding to the abstract input. This routine is just like
2778 | `roundAndPackFloatx80' except that the input significand does not have to be
2780 *----------------------------------------------------------------------------*/
2782 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2783 flag zSign, int32_t zExp,
2784 uint64_t zSig0, uint64_t zSig1,
2785 float_status *status)
2794 shiftCount = countLeadingZeros64( zSig0 );
2795 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2797 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2798 zSig0, zSig1, status);
2802 /*----------------------------------------------------------------------------
2803 | Returns the least-significant 64 fraction bits of the quadruple-precision
2804 | floating-point value `a'.
2805 *----------------------------------------------------------------------------*/
2807 static inline uint64_t extractFloat128Frac1( float128 a )
2814 /*----------------------------------------------------------------------------
2815 | Returns the most-significant 48 fraction bits of the quadruple-precision
2816 | floating-point value `a'.
2817 *----------------------------------------------------------------------------*/
2819 static inline uint64_t extractFloat128Frac0( float128 a )
2822 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2826 /*----------------------------------------------------------------------------
2827 | Returns the exponent bits of the quadruple-precision floating-point value
2829 *----------------------------------------------------------------------------*/
2831 static inline int32_t extractFloat128Exp( float128 a )
2834 return ( a.high>>48 ) & 0x7FFF;
2838 /*----------------------------------------------------------------------------
2839 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2840 *----------------------------------------------------------------------------*/
2842 static inline flag extractFloat128Sign( float128 a )
2849 /*----------------------------------------------------------------------------
2850 | Normalizes the subnormal quadruple-precision floating-point value
2851 | represented by the denormalized significand formed by the concatenation of
2852 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2853 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2854 | significand are stored at the location pointed to by `zSig0Ptr', and the
2855 | least significant 64 bits of the normalized significand are stored at the
2856 | location pointed to by `zSig1Ptr'.
2857 *----------------------------------------------------------------------------*/
2860 normalizeFloat128Subnormal(
2871 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2872 if ( shiftCount < 0 ) {
2873 *zSig0Ptr = aSig1>>( - shiftCount );
2874 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2877 *zSig0Ptr = aSig1<<shiftCount;
2880 *zExpPtr = - shiftCount - 63;
2883 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2884 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2885 *zExpPtr = 1 - shiftCount;
2890 /*----------------------------------------------------------------------------
2891 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2892 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2893 | floating-point value, returning the result. After being shifted into the
2894 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2895 | added together to form the most significant 32 bits of the result. This
2896 | means that any integer portion of `zSig0' will be added into the exponent.
2897 | Since a properly normalized significand will have an integer portion equal
2898 | to 1, the `zExp' input should be 1 less than the desired result exponent
2899 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2901 *----------------------------------------------------------------------------*/
2903 static inline float128
2904 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2909 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2914 /*----------------------------------------------------------------------------
2915 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2916 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2917 | and `zSig2', and returns the proper quadruple-precision floating-point value
2918 | corresponding to the abstract input. Ordinarily, the abstract value is
2919 | simply rounded and packed into the quadruple-precision format, with the
2920 | inexact exception raised if the abstract input cannot be represented
2921 | exactly. However, if the abstract value is too large, the overflow and
2922 | inexact exceptions are raised and an infinity or maximal finite value is
2923 | returned. If the abstract value is too small, the input value is rounded to
2924 | a subnormal number, and the underflow and inexact exceptions are raised if
2925 | the abstract input cannot be represented exactly as a subnormal quadruple-
2926 | precision floating-point number.
2927 | The input significand must be normalized or smaller. If the input
2928 | significand is not normalized, `zExp' must be 0; in that case, the result
2929 | returned is a subnormal number, and it must not require rounding. In the
2930 | usual case that the input significand is normalized, `zExp' must be 1 less
2931 | than the ``true'' floating-point exponent. The handling of underflow and
2932 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2933 *----------------------------------------------------------------------------*/
2935 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2936 uint64_t zSig0, uint64_t zSig1,
2937 uint64_t zSig2, float_status *status)
2939 int8_t roundingMode;
2940 flag roundNearestEven, increment, isTiny;
2942 roundingMode = status->float_rounding_mode;
2943 roundNearestEven = ( roundingMode == float_round_nearest_even );
2944 switch (roundingMode) {
2945 case float_round_nearest_even:
2946 case float_round_ties_away:
2947 increment = ((int64_t)zSig2 < 0);
2949 case float_round_to_zero:
2952 case float_round_up:
2953 increment = !zSign && zSig2;
2955 case float_round_down:
2956 increment = zSign && zSig2;
2958 case float_round_to_odd:
2959 increment = !(zSig1 & 0x1) && zSig2;
2964 if ( 0x7FFD <= (uint32_t) zExp ) {
2965 if ( ( 0x7FFD < zExp )
2966 || ( ( zExp == 0x7FFD )
2968 LIT64( 0x0001FFFFFFFFFFFF ),
2969 LIT64( 0xFFFFFFFFFFFFFFFF ),
2976 float_raise(float_flag_overflow | float_flag_inexact, status);
2977 if ( ( roundingMode == float_round_to_zero )
2978 || ( zSign && ( roundingMode == float_round_up ) )
2979 || ( ! zSign && ( roundingMode == float_round_down ) )
2980 || (roundingMode == float_round_to_odd)
2986 LIT64( 0x0000FFFFFFFFFFFF ),
2987 LIT64( 0xFFFFFFFFFFFFFFFF )
2990 return packFloat128( zSign, 0x7FFF, 0, 0 );
2993 if (status->flush_to_zero) {
2994 float_raise(float_flag_output_denormal, status);
2995 return packFloat128(zSign, 0, 0, 0);
2998 (status->float_detect_tininess
2999 == float_tininess_before_rounding)
3005 LIT64( 0x0001FFFFFFFFFFFF ),
3006 LIT64( 0xFFFFFFFFFFFFFFFF )
3008 shift128ExtraRightJamming(
3009 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3011 if (isTiny && zSig2) {
3012 float_raise(float_flag_underflow, status);
3014 switch (roundingMode) {
3015 case float_round_nearest_even:
3016 case float_round_ties_away:
3017 increment = ((int64_t)zSig2 < 0);
3019 case float_round_to_zero:
3022 case float_round_up:
3023 increment = !zSign && zSig2;
3025 case float_round_down:
3026 increment = zSign && zSig2;
3028 case float_round_to_odd:
3029 increment = !(zSig1 & 0x1) && zSig2;
3037 status->float_exception_flags |= float_flag_inexact;
3040 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3041 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3044 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3046 return packFloat128( zSign, zExp, zSig0, zSig1 );
3050 /*----------------------------------------------------------------------------
3051 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3052 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3053 | returns the proper quadruple-precision floating-point value corresponding
3054 | to the abstract input. This routine is just like `roundAndPackFloat128'
3055 | except that the input significand has fewer bits and does not have to be
3056 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3058 *----------------------------------------------------------------------------*/
3060 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3061 uint64_t zSig0, uint64_t zSig1,
3062 float_status *status)
3072 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3073 if ( 0 <= shiftCount ) {
3075 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3078 shift128ExtraRightJamming(
3079 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3082 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3087 /*----------------------------------------------------------------------------
3088 | Returns the result of converting the 32-bit two's complement integer `a'
3089 | to the extended double-precision floating-point format. The conversion
3090 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3092 *----------------------------------------------------------------------------*/
3094 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3101 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3103 absA = zSign ? - a : a;
3104 shiftCount = countLeadingZeros32( absA ) + 32;
3106 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3110 /*----------------------------------------------------------------------------
3111 | Returns the result of converting the 32-bit two's complement integer `a' to
3112 | the quadruple-precision floating-point format. The conversion is performed
3113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3114 *----------------------------------------------------------------------------*/
3116 float128 int32_to_float128(int32_t a, float_status *status)
3123 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3125 absA = zSign ? - a : a;
3126 shiftCount = countLeadingZeros32( absA ) + 17;
3128 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3132 /*----------------------------------------------------------------------------
3133 | Returns the result of converting the 64-bit two's complement integer `a'
3134 | to the extended double-precision floating-point format. The conversion
3135 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3137 *----------------------------------------------------------------------------*/
3139 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3145 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3147 absA = zSign ? - a : a;
3148 shiftCount = countLeadingZeros64( absA );
3149 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3153 /*----------------------------------------------------------------------------
3154 | Returns the result of converting the 64-bit two's complement integer `a' to
3155 | the quadruple-precision floating-point format. The conversion is performed
3156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3157 *----------------------------------------------------------------------------*/
3159 float128 int64_to_float128(int64_t a, float_status *status)
3165 uint64_t zSig0, zSig1;
3167 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3169 absA = zSign ? - a : a;
3170 shiftCount = countLeadingZeros64( absA ) + 49;
3171 zExp = 0x406E - shiftCount;
3172 if ( 64 <= shiftCount ) {
3181 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3182 return packFloat128( zSign, zExp, zSig0, zSig1 );
3186 /*----------------------------------------------------------------------------
3187 | Returns the result of converting the 64-bit unsigned integer `a'
3188 | to the quadruple-precision floating-point format. The conversion is performed
3189 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3190 *----------------------------------------------------------------------------*/
3192 float128 uint64_to_float128(uint64_t a, float_status *status)
3195 return float128_zero;
3197 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3203 /*----------------------------------------------------------------------------
3204 | Returns the result of converting the single-precision floating-point value
3205 | `a' to the double-precision floating-point format. The conversion is
3206 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3208 *----------------------------------------------------------------------------*/
3210 float64 float32_to_float64(float32 a, float_status *status)
3215 a = float32_squash_input_denormal(a, status);
3217 aSig = extractFloat32Frac( a );
3218 aExp = extractFloat32Exp( a );
3219 aSign = extractFloat32Sign( a );
3220 if ( aExp == 0xFF ) {
3222 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3224 return packFloat64( aSign, 0x7FF, 0 );
3227 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3228 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3231 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3235 /*----------------------------------------------------------------------------
3236 | Returns the result of converting the single-precision floating-point value
3237 | `a' to the extended double-precision floating-point format. The conversion
3238 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3240 *----------------------------------------------------------------------------*/
3242 floatx80 float32_to_floatx80(float32 a, float_status *status)
3248 a = float32_squash_input_denormal(a, status);
3249 aSig = extractFloat32Frac( a );
3250 aExp = extractFloat32Exp( a );
3251 aSign = extractFloat32Sign( a );
3252 if ( aExp == 0xFF ) {
3254 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3256 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3259 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3260 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3263 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3267 /*----------------------------------------------------------------------------
3268 | Returns the result of converting the single-precision floating-point value
3269 | `a' to the double-precision floating-point format. The conversion is
3270 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3272 *----------------------------------------------------------------------------*/
3274 float128 float32_to_float128(float32 a, float_status *status)
3280 a = float32_squash_input_denormal(a, status);
3281 aSig = extractFloat32Frac( a );
3282 aExp = extractFloat32Exp( a );
3283 aSign = extractFloat32Sign( a );
3284 if ( aExp == 0xFF ) {
3286 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3288 return packFloat128( aSign, 0x7FFF, 0, 0 );
3291 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3292 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3295 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3299 /*----------------------------------------------------------------------------
3300 | Returns the remainder of the single-precision floating-point value `a'
3301 | with respect to the corresponding value `b'. The operation is performed
3302 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3303 *----------------------------------------------------------------------------*/
3305 float32 float32_rem(float32 a, float32 b, float_status *status)
3308 int aExp, bExp, expDiff;
3309 uint32_t aSig, bSig;
3311 uint64_t aSig64, bSig64, q64;
3312 uint32_t alternateASig;
3314 a = float32_squash_input_denormal(a, status);
3315 b = float32_squash_input_denormal(b, status);
3317 aSig = extractFloat32Frac( a );
3318 aExp = extractFloat32Exp( a );
3319 aSign = extractFloat32Sign( a );
3320 bSig = extractFloat32Frac( b );
3321 bExp = extractFloat32Exp( b );
3322 if ( aExp == 0xFF ) {
3323 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3324 return propagateFloat32NaN(a, b, status);
3326 float_raise(float_flag_invalid, status);
3327 return float32_default_nan(status);
3329 if ( bExp == 0xFF ) {
3331 return propagateFloat32NaN(a, b, status);
3337 float_raise(float_flag_invalid, status);
3338 return float32_default_nan(status);
3340 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3343 if ( aSig == 0 ) return a;
3344 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3346 expDiff = aExp - bExp;
3349 if ( expDiff < 32 ) {
3352 if ( expDiff < 0 ) {
3353 if ( expDiff < -1 ) return a;
3356 q = ( bSig <= aSig );
3357 if ( q ) aSig -= bSig;
3358 if ( 0 < expDiff ) {
3359 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3362 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3370 if ( bSig <= aSig ) aSig -= bSig;
3371 aSig64 = ( (uint64_t) aSig )<<40;
3372 bSig64 = ( (uint64_t) bSig )<<40;
3374 while ( 0 < expDiff ) {
3375 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3376 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3377 aSig64 = - ( ( bSig * q64 )<<38 );
3381 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3382 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3383 q = q64>>( 64 - expDiff );
3385 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3388 alternateASig = aSig;
3391 } while ( 0 <= (int32_t) aSig );
3392 sigMean = aSig + alternateASig;
3393 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3394 aSig = alternateASig;
3396 zSign = ( (int32_t) aSig < 0 );
3397 if ( zSign ) aSig = - aSig;
3398 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3403 /*----------------------------------------------------------------------------
3404 | Returns the binary exponential of the single-precision floating-point value
3405 | `a'. The operation is performed according to the IEC/IEEE Standard for
3406 | Binary Floating-Point Arithmetic.
3408 | Uses the following identities:
3410 | 1. -------------------------------------------------------------------------
3414 | 2. -------------------------------------------------------------------------
3417 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3419 *----------------------------------------------------------------------------*/
3421 static const float64 float32_exp2_coefficients[15] =
3423 const_float64( 0x3ff0000000000000ll ), /* 1 */
3424 const_float64( 0x3fe0000000000000ll ), /* 2 */
3425 const_float64( 0x3fc5555555555555ll ), /* 3 */
3426 const_float64( 0x3fa5555555555555ll ), /* 4 */
3427 const_float64( 0x3f81111111111111ll ), /* 5 */
3428 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3429 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3430 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3431 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3432 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3433 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3434 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3435 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3436 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3437 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3440 float32 float32_exp2(float32 a, float_status *status)
3447 a = float32_squash_input_denormal(a, status);
3449 aSig = extractFloat32Frac( a );
3450 aExp = extractFloat32Exp( a );
3451 aSign = extractFloat32Sign( a );
3453 if ( aExp == 0xFF) {
3455 return propagateFloat32NaN(a, float32_zero, status);
3457 return (aSign) ? float32_zero : a;
3460 if (aSig == 0) return float32_one;
3463 float_raise(float_flag_inexact, status);
3465 /* ******************************* */
3466 /* using float64 for approximation */
3467 /* ******************************* */
3468 x = float32_to_float64(a, status);
3469 x = float64_mul(x, float64_ln2, status);
3473 for (i = 0 ; i < 15 ; i++) {
3476 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3477 r = float64_add(r, f, status);
3479 xn = float64_mul(xn, x, status);
3482 return float64_to_float32(r, status);
3485 /*----------------------------------------------------------------------------
3486 | Returns the binary log of the single-precision floating-point value `a'.
3487 | The operation is performed according to the IEC/IEEE Standard for Binary
3488 | Floating-Point Arithmetic.
3489 *----------------------------------------------------------------------------*/
3490 float32 float32_log2(float32 a, float_status *status)
3494 uint32_t aSig, zSig, i;
3496 a = float32_squash_input_denormal(a, status);
3497 aSig = extractFloat32Frac( a );
3498 aExp = extractFloat32Exp( a );
3499 aSign = extractFloat32Sign( a );
3502 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3503 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3506 float_raise(float_flag_invalid, status);
3507 return float32_default_nan(status);
3509 if ( aExp == 0xFF ) {
3511 return propagateFloat32NaN(a, float32_zero, status);
3521 for (i = 1 << 22; i > 0; i >>= 1) {
3522 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3523 if ( aSig & 0x01000000 ) {
3532 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3535 /*----------------------------------------------------------------------------
3536 | Returns 1 if the single-precision floating-point value `a' is equal to
3537 | the corresponding value `b', and 0 otherwise. The invalid exception is
3538 | raised if either operand is a NaN. Otherwise, the comparison is performed
3539 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3540 *----------------------------------------------------------------------------*/
3542 int float32_eq(float32 a, float32 b, float_status *status)
3545 a = float32_squash_input_denormal(a, status);
3546 b = float32_squash_input_denormal(b, status);
3548 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3549 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3551 float_raise(float_flag_invalid, status);
3554 av = float32_val(a);
3555 bv = float32_val(b);
3556 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3559 /*----------------------------------------------------------------------------
3560 | Returns 1 if the single-precision floating-point value `a' is less than
3561 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3562 | exception is raised if either operand is a NaN. The comparison is performed
3563 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3564 *----------------------------------------------------------------------------*/
3566 int float32_le(float32 a, float32 b, float_status *status)
3570 a = float32_squash_input_denormal(a, status);
3571 b = float32_squash_input_denormal(b, status);
3573 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3574 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3576 float_raise(float_flag_invalid, status);
3579 aSign = extractFloat32Sign( a );
3580 bSign = extractFloat32Sign( b );
3581 av = float32_val(a);
3582 bv = float32_val(b);
3583 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3584 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3588 /*----------------------------------------------------------------------------
3589 | Returns 1 if the single-precision floating-point value `a' is less than
3590 | the corresponding value `b', and 0 otherwise. The invalid exception is
3591 | raised if either operand is a NaN. The comparison is performed according
3592 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3593 *----------------------------------------------------------------------------*/
3595 int float32_lt(float32 a, float32 b, float_status *status)
3599 a = float32_squash_input_denormal(a, status);
3600 b = float32_squash_input_denormal(b, status);
3602 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3603 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3605 float_raise(float_flag_invalid, status);
3608 aSign = extractFloat32Sign( a );
3609 bSign = extractFloat32Sign( b );
3610 av = float32_val(a);
3611 bv = float32_val(b);
3612 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3613 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3617 /*----------------------------------------------------------------------------
3618 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3619 | be compared, and 0 otherwise. The invalid exception is raised if either
3620 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3621 | Standard for Binary Floating-Point Arithmetic.
3622 *----------------------------------------------------------------------------*/
3624 int float32_unordered(float32 a, float32 b, float_status *status)
3626 a = float32_squash_input_denormal(a, status);
3627 b = float32_squash_input_denormal(b, status);
3629 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3630 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3632 float_raise(float_flag_invalid, status);
3638 /*----------------------------------------------------------------------------
3639 | Returns 1 if the single-precision floating-point value `a' is equal to
3640 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3641 | exception. The comparison is performed according to the IEC/IEEE Standard
3642 | for Binary Floating-Point Arithmetic.
3643 *----------------------------------------------------------------------------*/
3645 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3647 a = float32_squash_input_denormal(a, status);
3648 b = float32_squash_input_denormal(b, status);
3650 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3651 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3653 if (float32_is_signaling_nan(a, status)
3654 || float32_is_signaling_nan(b, status)) {
3655 float_raise(float_flag_invalid, status);
3659 return ( float32_val(a) == float32_val(b) ) ||
3660 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3663 /*----------------------------------------------------------------------------
3664 | Returns 1 if the single-precision floating-point value `a' is less than or
3665 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3666 | cause an exception. Otherwise, the comparison is performed according to the
3667 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3668 *----------------------------------------------------------------------------*/
3670 int float32_le_quiet(float32 a, float32 b, float_status *status)
3674 a = float32_squash_input_denormal(a, status);
3675 b = float32_squash_input_denormal(b, status);
3677 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3678 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3680 if (float32_is_signaling_nan(a, status)
3681 || float32_is_signaling_nan(b, status)) {
3682 float_raise(float_flag_invalid, status);
3686 aSign = extractFloat32Sign( a );
3687 bSign = extractFloat32Sign( b );
3688 av = float32_val(a);
3689 bv = float32_val(b);
3690 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3691 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3695 /*----------------------------------------------------------------------------
3696 | Returns 1 if the single-precision floating-point value `a' is less than
3697 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3698 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3699 | Standard for Binary Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/
3702 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3706 a = float32_squash_input_denormal(a, status);
3707 b = float32_squash_input_denormal(b, status);
3709 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3710 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3712 if (float32_is_signaling_nan(a, status)
3713 || float32_is_signaling_nan(b, status)) {
3714 float_raise(float_flag_invalid, status);
3718 aSign = extractFloat32Sign( a );
3719 bSign = extractFloat32Sign( b );
3720 av = float32_val(a);
3721 bv = float32_val(b);
3722 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3723 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3727 /*----------------------------------------------------------------------------
3728 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3729 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3730 | comparison is performed according to the IEC/IEEE Standard for Binary
3731 | Floating-Point Arithmetic.
3732 *----------------------------------------------------------------------------*/
3734 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3736 a = float32_squash_input_denormal(a, status);
3737 b = float32_squash_input_denormal(b, status);
3739 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3740 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3742 if (float32_is_signaling_nan(a, status)
3743 || float32_is_signaling_nan(b, status)) {
3744 float_raise(float_flag_invalid, status);
3752 /*----------------------------------------------------------------------------
3753 | Returns the result of converting the double-precision floating-point value
3754 | `a' to the single-precision floating-point format. The conversion is
3755 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3757 *----------------------------------------------------------------------------*/
3759 float32 float64_to_float32(float64 a, float_status *status)
3765 a = float64_squash_input_denormal(a, status);
3767 aSig = extractFloat64Frac( a );
3768 aExp = extractFloat64Exp( a );
3769 aSign = extractFloat64Sign( a );
3770 if ( aExp == 0x7FF ) {
3772 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3774 return packFloat32( aSign, 0xFF, 0 );
3776 shift64RightJamming( aSig, 22, &aSig );
3778 if ( aExp || zSig ) {
3782 return roundAndPackFloat32(aSign, aExp, zSig, status);
3787 /*----------------------------------------------------------------------------
3788 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3789 | half-precision floating-point value, returning the result. After being
3790 | shifted into the proper positions, the three fields are simply added
3791 | together to form the result. This means that any integer portion of `zSig'
3792 | will be added into the exponent. Since a properly normalized significand
3793 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3794 | than the desired result exponent whenever `zSig' is a complete, normalized
3796 *----------------------------------------------------------------------------*/
3797 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3799 return make_float16(
3800 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3803 /*----------------------------------------------------------------------------
3804 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3805 | and significand `zSig', and returns the proper half-precision floating-
3806 | point value corresponding to the abstract input. Ordinarily, the abstract
3807 | value is simply rounded and packed into the half-precision format, with
3808 | the inexact exception raised if the abstract input cannot be represented
3809 | exactly. However, if the abstract value is too large, the overflow and
3810 | inexact exceptions are raised and an infinity or maximal finite value is
3811 | returned. If the abstract value is too small, the input value is rounded to
3812 | a subnormal number, and the underflow and inexact exceptions are raised if
3813 | the abstract input cannot be represented exactly as a subnormal half-
3814 | precision floating-point number.
3815 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3816 | ARM-style "alternative representation", which omits the NaN and Inf
3817 | encodings in order to raise the maximum representable exponent by one.
3818 | The input significand `zSig' has its binary point between bits 22
3819 | and 23, which is 13 bits to the left of the usual location. This shifted
3820 | significand must be normalized or smaller. If `zSig' is not normalized,
3821 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3822 | and it must not require rounding. In the usual case that `zSig' is
3823 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3824 | Note the slightly odd position of the binary point in zSig compared with the
3825 | other roundAndPackFloat functions. This should probably be fixed if we
3826 | need to implement more float16 routines than just conversion.
3827 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3828 | Binary Floating-Point Arithmetic.
3829 *----------------------------------------------------------------------------*/
3831 static float16 roundAndPackFloat16(flag zSign, int zExp,
3832 uint32_t zSig, flag ieee,
3833 float_status *status)
3835 int maxexp = ieee ? 29 : 30;
3838 bool rounding_bumps_exp;
3839 bool is_tiny = false;
3841 /* Calculate the mask of bits of the mantissa which are not
3842 * representable in half-precision and will be lost.
3845 /* Will be denormal in halfprec */
3851 /* Normal number in halfprec */
3855 switch (status->float_rounding_mode) {
3856 case float_round_nearest_even:
3857 increment = (mask + 1) >> 1;
3858 if ((zSig & mask) == increment) {
3859 increment = zSig & (increment << 1);
3862 case float_round_ties_away:
3863 increment = (mask + 1) >> 1;
3865 case float_round_up:
3866 increment = zSign ? 0 : mask;
3868 case float_round_down:
3869 increment = zSign ? mask : 0;
3871 default: /* round_to_zero */
3876 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3878 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3880 float_raise(float_flag_overflow | float_flag_inexact, status);
3881 return packFloat16(zSign, 0x1f, 0);
3883 float_raise(float_flag_invalid, status);
3884 return packFloat16(zSign, 0x1f, 0x3ff);
3889 /* Note that flush-to-zero does not affect half-precision results */
3891 (status->float_detect_tininess == float_tininess_before_rounding)
3893 || (!rounding_bumps_exp);
3896 float_raise(float_flag_inexact, status);
3898 float_raise(float_flag_underflow, status);
3903 if (rounding_bumps_exp) {
3909 return packFloat16(zSign, 0, 0);
3915 return packFloat16(zSign, zExp, zSig >> 13);
3918 /*----------------------------------------------------------------------------
3919 | If `a' is denormal and we are in flush-to-zero mode then set the
3920 | input-denormal exception and return zero. Otherwise just return the value.
3921 *----------------------------------------------------------------------------*/
3922 float16 float16_squash_input_denormal(float16 a, float_status *status)
3924 if (status->flush_inputs_to_zero) {
3925 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3926 float_raise(float_flag_input_denormal, status);
3927 return make_float16(float16_val(a) & 0x8000);
3933 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3936 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3937 *zSigPtr = aSig << shiftCount;
3938 *zExpPtr = 1 - shiftCount;
3941 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3942 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3944 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3950 aSign = extractFloat16Sign(a);
3951 aExp = extractFloat16Exp(a);
3952 aSig = extractFloat16Frac(a);
3954 if (aExp == 0x1f && ieee) {
3956 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3958 return packFloat32(aSign, 0xff, 0);
3962 return packFloat32(aSign, 0, 0);
3965 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3968 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3971 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3977 a = float32_squash_input_denormal(a, status);
3979 aSig = extractFloat32Frac( a );
3980 aExp = extractFloat32Exp( a );
3981 aSign = extractFloat32Sign( a );
3982 if ( aExp == 0xFF ) {
3984 /* Input is a NaN */
3986 float_raise(float_flag_invalid, status);
3987 return packFloat16(aSign, 0, 0);
3989 return commonNaNToFloat16(
3990 float32ToCommonNaN(a, status), status);
3994 float_raise(float_flag_invalid, status);
3995 return packFloat16(aSign, 0x1f, 0x3ff);
3997 return packFloat16(aSign, 0x1f, 0);
3999 if (aExp == 0 && aSig == 0) {
4000 return packFloat16(aSign, 0, 0);
4002 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4003 * even if the input is denormal; however this is harmless because
4004 * the largest possible single-precision denormal is still smaller
4005 * than the smallest representable half-precision denormal, and so we
4006 * will end up ignoring aSig and returning via the "always return zero"
4012 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
4015 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
4021 aSign = extractFloat16Sign(a);
4022 aExp = extractFloat16Exp(a);
4023 aSig = extractFloat16Frac(a);
4025 if (aExp == 0x1f && ieee) {
4027 return commonNaNToFloat64(
4028 float16ToCommonNaN(a, status), status);
4030 return packFloat64(aSign, 0x7ff, 0);
4034 return packFloat64(aSign, 0, 0);
4037 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
4040 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
4043 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
4050 a = float64_squash_input_denormal(a, status);
4052 aSig = extractFloat64Frac(a);
4053 aExp = extractFloat64Exp(a);
4054 aSign = extractFloat64Sign(a);
4055 if (aExp == 0x7FF) {
4057 /* Input is a NaN */
4059 float_raise(float_flag_invalid, status);
4060 return packFloat16(aSign, 0, 0);
4062 return commonNaNToFloat16(
4063 float64ToCommonNaN(a, status), status);
4067 float_raise(float_flag_invalid, status);
4068 return packFloat16(aSign, 0x1f, 0x3ff);
4070 return packFloat16(aSign, 0x1f, 0);
4072 shift64RightJamming(aSig, 29, &aSig);
4074 if (aExp == 0 && zSig == 0) {
4075 return packFloat16(aSign, 0, 0);
4077 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4078 * even if the input is denormal; however this is harmless because
4079 * the largest possible single-precision denormal is still smaller
4080 * than the smallest representable half-precision denormal, and so we
4081 * will end up ignoring aSig and returning via the "always return zero"
4087 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4090 /*----------------------------------------------------------------------------
4091 | Returns the result of converting the double-precision floating-point value
4092 | `a' to the extended double-precision floating-point format. The conversion
4093 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4095 *----------------------------------------------------------------------------*/
4097 floatx80 float64_to_floatx80(float64 a, float_status *status)
4103 a = float64_squash_input_denormal(a, status);
4104 aSig = extractFloat64Frac( a );
4105 aExp = extractFloat64Exp( a );
4106 aSign = extractFloat64Sign( a );
4107 if ( aExp == 0x7FF ) {
4109 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4111 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4114 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4115 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4119 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4123 /*----------------------------------------------------------------------------
4124 | Returns the result of converting the double-precision floating-point value
4125 | `a' to the quadruple-precision floating-point format. The conversion is
4126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4128 *----------------------------------------------------------------------------*/
4130 float128 float64_to_float128(float64 a, float_status *status)
4134 uint64_t aSig, zSig0, zSig1;
4136 a = float64_squash_input_denormal(a, status);
4137 aSig = extractFloat64Frac( a );
4138 aExp = extractFloat64Exp( a );
4139 aSign = extractFloat64Sign( a );
4140 if ( aExp == 0x7FF ) {
4142 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4144 return packFloat128( aSign, 0x7FFF, 0, 0 );
4147 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4148 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4151 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4152 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4157 /*----------------------------------------------------------------------------
4158 | Returns the remainder of the double-precision floating-point value `a'
4159 | with respect to the corresponding value `b'. The operation is performed
4160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4161 *----------------------------------------------------------------------------*/
4163 float64 float64_rem(float64 a, float64 b, float_status *status)
4166 int aExp, bExp, expDiff;
4167 uint64_t aSig, bSig;
4168 uint64_t q, alternateASig;
4171 a = float64_squash_input_denormal(a, status);
4172 b = float64_squash_input_denormal(b, status);
4173 aSig = extractFloat64Frac( a );
4174 aExp = extractFloat64Exp( a );
4175 aSign = extractFloat64Sign( a );
4176 bSig = extractFloat64Frac( b );
4177 bExp = extractFloat64Exp( b );
4178 if ( aExp == 0x7FF ) {
4179 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4180 return propagateFloat64NaN(a, b, status);
4182 float_raise(float_flag_invalid, status);
4183 return float64_default_nan(status);
4185 if ( bExp == 0x7FF ) {
4187 return propagateFloat64NaN(a, b, status);
4193 float_raise(float_flag_invalid, status);
4194 return float64_default_nan(status);
4196 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4199 if ( aSig == 0 ) return a;
4200 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4202 expDiff = aExp - bExp;
4203 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4204 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4205 if ( expDiff < 0 ) {
4206 if ( expDiff < -1 ) return a;
4209 q = ( bSig <= aSig );
4210 if ( q ) aSig -= bSig;
4212 while ( 0 < expDiff ) {
4213 q = estimateDiv128To64( aSig, 0, bSig );
4214 q = ( 2 < q ) ? q - 2 : 0;
4215 aSig = - ( ( bSig>>2 ) * q );
4219 if ( 0 < expDiff ) {
4220 q = estimateDiv128To64( aSig, 0, bSig );
4221 q = ( 2 < q ) ? q - 2 : 0;
4224 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4231 alternateASig = aSig;
4234 } while ( 0 <= (int64_t) aSig );
4235 sigMean = aSig + alternateASig;
4236 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4237 aSig = alternateASig;
4239 zSign = ( (int64_t) aSig < 0 );
4240 if ( zSign ) aSig = - aSig;
4241 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4245 /*----------------------------------------------------------------------------
4246 | Returns the binary log of the double-precision floating-point value `a'.
4247 | The operation is performed according to the IEC/IEEE Standard for Binary
4248 | Floating-Point Arithmetic.
4249 *----------------------------------------------------------------------------*/
4250 float64 float64_log2(float64 a, float_status *status)
4254 uint64_t aSig, aSig0, aSig1, zSig, i;
4255 a = float64_squash_input_denormal(a, status);
4257 aSig = extractFloat64Frac( a );
4258 aExp = extractFloat64Exp( a );
4259 aSign = extractFloat64Sign( a );
4262 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4263 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4266 float_raise(float_flag_invalid, status);
4267 return float64_default_nan(status);
4269 if ( aExp == 0x7FF ) {
4271 return propagateFloat64NaN(a, float64_zero, status);
4277 aSig |= LIT64( 0x0010000000000000 );
4279 zSig = (uint64_t)aExp << 52;
4280 for (i = 1LL << 51; i > 0; i >>= 1) {
4281 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4282 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4283 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4291 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4294 /*----------------------------------------------------------------------------
4295 | Returns 1 if the double-precision floating-point value `a' is equal to the
4296 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4297 | if either operand is a NaN. Otherwise, the comparison is performed
4298 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4299 *----------------------------------------------------------------------------*/
4301 int float64_eq(float64 a, float64 b, float_status *status)
4304 a = float64_squash_input_denormal(a, status);
4305 b = float64_squash_input_denormal(b, status);
4307 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4308 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4310 float_raise(float_flag_invalid, status);
4313 av = float64_val(a);
4314 bv = float64_val(b);
4315 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4319 /*----------------------------------------------------------------------------
4320 | Returns 1 if the double-precision floating-point value `a' is less than or
4321 | equal to the corresponding value `b', and 0 otherwise. The invalid
4322 | exception is raised if either operand is a NaN. The comparison is performed
4323 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4324 *----------------------------------------------------------------------------*/
4326 int float64_le(float64 a, float64 b, float_status *status)
4330 a = float64_squash_input_denormal(a, status);
4331 b = float64_squash_input_denormal(b, status);
4333 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4334 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4336 float_raise(float_flag_invalid, status);
4339 aSign = extractFloat64Sign( a );
4340 bSign = extractFloat64Sign( b );
4341 av = float64_val(a);
4342 bv = float64_val(b);
4343 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4344 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4348 /*----------------------------------------------------------------------------
4349 | Returns 1 if the double-precision floating-point value `a' is less than
4350 | the corresponding value `b', and 0 otherwise. The invalid exception is
4351 | raised if either operand is a NaN. The comparison is performed according
4352 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4353 *----------------------------------------------------------------------------*/
4355 int float64_lt(float64 a, float64 b, float_status *status)
4360 a = float64_squash_input_denormal(a, status);
4361 b = float64_squash_input_denormal(b, status);
4362 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4363 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4365 float_raise(float_flag_invalid, status);
4368 aSign = extractFloat64Sign( a );
4369 bSign = extractFloat64Sign( b );
4370 av = float64_val(a);
4371 bv = float64_val(b);
4372 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4373 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4377 /*----------------------------------------------------------------------------
4378 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4379 | be compared, and 0 otherwise. The invalid exception is raised if either
4380 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4381 | Standard for Binary Floating-Point Arithmetic.
4382 *----------------------------------------------------------------------------*/
4384 int float64_unordered(float64 a, float64 b, float_status *status)
4386 a = float64_squash_input_denormal(a, status);
4387 b = float64_squash_input_denormal(b, status);
4389 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4390 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4392 float_raise(float_flag_invalid, status);
4398 /*----------------------------------------------------------------------------
4399 | Returns 1 if the double-precision floating-point value `a' is equal to the
4400 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4401 | exception.The comparison is performed according to the IEC/IEEE Standard
4402 | for Binary Floating-Point Arithmetic.
4403 *----------------------------------------------------------------------------*/
4405 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4408 a = float64_squash_input_denormal(a, status);
4409 b = float64_squash_input_denormal(b, status);
4411 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4412 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4414 if (float64_is_signaling_nan(a, status)
4415 || float64_is_signaling_nan(b, status)) {
4416 float_raise(float_flag_invalid, status);
4420 av = float64_val(a);
4421 bv = float64_val(b);
4422 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4426 /*----------------------------------------------------------------------------
4427 | Returns 1 if the double-precision floating-point value `a' is less than or
4428 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4429 | cause an exception. Otherwise, the comparison is performed according to the
4430 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4431 *----------------------------------------------------------------------------*/
4433 int float64_le_quiet(float64 a, float64 b, float_status *status)
4437 a = float64_squash_input_denormal(a, status);
4438 b = float64_squash_input_denormal(b, status);
4440 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4441 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4443 if (float64_is_signaling_nan(a, status)
4444 || float64_is_signaling_nan(b, status)) {
4445 float_raise(float_flag_invalid, status);
4449 aSign = extractFloat64Sign( a );
4450 bSign = extractFloat64Sign( b );
4451 av = float64_val(a);
4452 bv = float64_val(b);
4453 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4454 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4458 /*----------------------------------------------------------------------------
4459 | Returns 1 if the double-precision floating-point value `a' is less than
4460 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4461 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4462 | Standard for Binary Floating-Point Arithmetic.
4463 *----------------------------------------------------------------------------*/
4465 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4469 a = float64_squash_input_denormal(a, status);
4470 b = float64_squash_input_denormal(b, status);
4472 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4473 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4475 if (float64_is_signaling_nan(a, status)
4476 || float64_is_signaling_nan(b, status)) {
4477 float_raise(float_flag_invalid, status);
4481 aSign = extractFloat64Sign( a );
4482 bSign = extractFloat64Sign( b );
4483 av = float64_val(a);
4484 bv = float64_val(b);
4485 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4486 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4490 /*----------------------------------------------------------------------------
4491 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4492 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4493 | comparison is performed according to the IEC/IEEE Standard for Binary
4494 | Floating-Point Arithmetic.
4495 *----------------------------------------------------------------------------*/
4497 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4499 a = float64_squash_input_denormal(a, status);
4500 b = float64_squash_input_denormal(b, status);
4502 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4503 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4505 if (float64_is_signaling_nan(a, status)
4506 || float64_is_signaling_nan(b, status)) {
4507 float_raise(float_flag_invalid, status);
4514 /*----------------------------------------------------------------------------
4515 | Returns the result of converting the extended double-precision floating-
4516 | point value `a' to the 32-bit two's complement integer format. The
4517 | conversion is performed according to the IEC/IEEE Standard for Binary
4518 | Floating-Point Arithmetic---which means in particular that the conversion
4519 | is rounded according to the current rounding mode. If `a' is a NaN, the
4520 | largest positive integer is returned. Otherwise, if the conversion
4521 | overflows, the largest integer with the same sign as `a' is returned.
4522 *----------------------------------------------------------------------------*/
4524 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4527 int32_t aExp, shiftCount;
4530 if (floatx80_invalid_encoding(a)) {
4531 float_raise(float_flag_invalid, status);
4534 aSig = extractFloatx80Frac( a );
4535 aExp = extractFloatx80Exp( a );
4536 aSign = extractFloatx80Sign( a );
4537 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4538 shiftCount = 0x4037 - aExp;
4539 if ( shiftCount <= 0 ) shiftCount = 1;
4540 shift64RightJamming( aSig, shiftCount, &aSig );
4541 return roundAndPackInt32(aSign, aSig, status);
4545 /*----------------------------------------------------------------------------
4546 | Returns the result of converting the extended double-precision floating-
4547 | point value `a' to the 32-bit two's complement integer format. The
4548 | conversion is performed according to the IEC/IEEE Standard for Binary
4549 | Floating-Point Arithmetic, except that the conversion is always rounded
4550 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4551 | Otherwise, if the conversion overflows, the largest integer with the same
4552 | sign as `a' is returned.
4553 *----------------------------------------------------------------------------*/
4555 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4558 int32_t aExp, shiftCount;
4559 uint64_t aSig, savedASig;
4562 if (floatx80_invalid_encoding(a)) {
4563 float_raise(float_flag_invalid, status);
4566 aSig = extractFloatx80Frac( a );
4567 aExp = extractFloatx80Exp( a );
4568 aSign = extractFloatx80Sign( a );
4569 if ( 0x401E < aExp ) {
4570 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4573 else if ( aExp < 0x3FFF ) {
4575 status->float_exception_flags |= float_flag_inexact;
4579 shiftCount = 0x403E - aExp;
4581 aSig >>= shiftCount;
4583 if ( aSign ) z = - z;
4584 if ( ( z < 0 ) ^ aSign ) {
4586 float_raise(float_flag_invalid, status);
4587 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4589 if ( ( aSig<<shiftCount ) != savedASig ) {
4590 status->float_exception_flags |= float_flag_inexact;
4596 /*----------------------------------------------------------------------------
4597 | Returns the result of converting the extended double-precision floating-
4598 | point value `a' to the 64-bit two's complement integer format. The
4599 | conversion is performed according to the IEC/IEEE Standard for Binary
4600 | Floating-Point Arithmetic---which means in particular that the conversion
4601 | is rounded according to the current rounding mode. If `a' is a NaN,
4602 | the largest positive integer is returned. Otherwise, if the conversion
4603 | overflows, the largest integer with the same sign as `a' is returned.
4604 *----------------------------------------------------------------------------*/
4606 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4609 int32_t aExp, shiftCount;
4610 uint64_t aSig, aSigExtra;
4612 if (floatx80_invalid_encoding(a)) {
4613 float_raise(float_flag_invalid, status);
4616 aSig = extractFloatx80Frac( a );
4617 aExp = extractFloatx80Exp( a );
4618 aSign = extractFloatx80Sign( a );
4619 shiftCount = 0x403E - aExp;
4620 if ( shiftCount <= 0 ) {
4622 float_raise(float_flag_invalid, status);
4624 || ( ( aExp == 0x7FFF )
4625 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4627 return LIT64( 0x7FFFFFFFFFFFFFFF );
4629 return (int64_t) LIT64( 0x8000000000000000 );
4634 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4636 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4640 /*----------------------------------------------------------------------------
4641 | Returns the result of converting the extended double-precision floating-
4642 | point value `a' to the 64-bit two's complement integer format. The
4643 | conversion is performed according to the IEC/IEEE Standard for Binary
4644 | Floating-Point Arithmetic, except that the conversion is always rounded
4645 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4646 | Otherwise, if the conversion overflows, the largest integer with the same
4647 | sign as `a' is returned.
4648 *----------------------------------------------------------------------------*/
4650 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4653 int32_t aExp, shiftCount;
4657 if (floatx80_invalid_encoding(a)) {
4658 float_raise(float_flag_invalid, status);
4661 aSig = extractFloatx80Frac( a );
4662 aExp = extractFloatx80Exp( a );
4663 aSign = extractFloatx80Sign( a );
4664 shiftCount = aExp - 0x403E;
4665 if ( 0 <= shiftCount ) {
4666 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4667 if ( ( a.high != 0xC03E ) || aSig ) {
4668 float_raise(float_flag_invalid, status);
4669 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4670 return LIT64( 0x7FFFFFFFFFFFFFFF );
4673 return (int64_t) LIT64( 0x8000000000000000 );
4675 else if ( aExp < 0x3FFF ) {
4677 status->float_exception_flags |= float_flag_inexact;
4681 z = aSig>>( - shiftCount );
4682 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4683 status->float_exception_flags |= float_flag_inexact;
4685 if ( aSign ) z = - z;
4690 /*----------------------------------------------------------------------------
4691 | Returns the result of converting the extended double-precision floating-
4692 | point value `a' to the single-precision floating-point format. The
4693 | conversion is performed according to the IEC/IEEE Standard for Binary
4694 | Floating-Point Arithmetic.
4695 *----------------------------------------------------------------------------*/
4697 float32 floatx80_to_float32(floatx80 a, float_status *status)
4703 if (floatx80_invalid_encoding(a)) {
4704 float_raise(float_flag_invalid, status);
4705 return float32_default_nan(status);
4707 aSig = extractFloatx80Frac( a );
4708 aExp = extractFloatx80Exp( a );
4709 aSign = extractFloatx80Sign( a );
4710 if ( aExp == 0x7FFF ) {
4711 if ( (uint64_t) ( aSig<<1 ) ) {
4712 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4714 return packFloat32( aSign, 0xFF, 0 );
4716 shift64RightJamming( aSig, 33, &aSig );
4717 if ( aExp || aSig ) aExp -= 0x3F81;
4718 return roundAndPackFloat32(aSign, aExp, aSig, status);
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the extended double-precision floating-
4724 | point value `a' to the double-precision floating-point format. The
4725 | conversion is performed according to the IEC/IEEE Standard for Binary
4726 | Floating-Point Arithmetic.
4727 *----------------------------------------------------------------------------*/
4729 float64 floatx80_to_float64(floatx80 a, float_status *status)
4733 uint64_t aSig, zSig;
4735 if (floatx80_invalid_encoding(a)) {
4736 float_raise(float_flag_invalid, status);
4737 return float64_default_nan(status);
4739 aSig = extractFloatx80Frac( a );
4740 aExp = extractFloatx80Exp( a );
4741 aSign = extractFloatx80Sign( a );
4742 if ( aExp == 0x7FFF ) {
4743 if ( (uint64_t) ( aSig<<1 ) ) {
4744 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4746 return packFloat64( aSign, 0x7FF, 0 );
4748 shift64RightJamming( aSig, 1, &zSig );
4749 if ( aExp || aSig ) aExp -= 0x3C01;
4750 return roundAndPackFloat64(aSign, aExp, zSig, status);
4754 /*----------------------------------------------------------------------------
4755 | Returns the result of converting the extended double-precision floating-
4756 | point value `a' to the quadruple-precision floating-point format. The
4757 | conversion is performed according to the IEC/IEEE Standard for Binary
4758 | Floating-Point Arithmetic.
4759 *----------------------------------------------------------------------------*/
4761 float128 floatx80_to_float128(floatx80 a, float_status *status)
4765 uint64_t aSig, zSig0, zSig1;
4767 if (floatx80_invalid_encoding(a)) {
4768 float_raise(float_flag_invalid, status);
4769 return float128_default_nan(status);
4771 aSig = extractFloatx80Frac( a );
4772 aExp = extractFloatx80Exp( a );
4773 aSign = extractFloatx80Sign( a );
4774 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4775 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4777 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4778 return packFloat128( aSign, aExp, zSig0, zSig1 );
4782 /*----------------------------------------------------------------------------
4783 | Rounds the extended double-precision floating-point value `a'
4784 | to the precision provided by floatx80_rounding_precision and returns the
4785 | result as an extended double-precision floating-point value.
4786 | The operation is performed according to the IEC/IEEE Standard for Binary
4787 | Floating-Point Arithmetic.
4788 *----------------------------------------------------------------------------*/
4790 floatx80 floatx80_round(floatx80 a, float_status *status)
4792 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4793 extractFloatx80Sign(a),
4794 extractFloatx80Exp(a),
4795 extractFloatx80Frac(a), 0, status);
4798 /*----------------------------------------------------------------------------
4799 | Rounds the extended double-precision floating-point value `a' to an integer,
4800 | and returns the result as an extended quadruple-precision floating-point
4801 | value. The operation is performed according to the IEC/IEEE Standard for
4802 | Binary Floating-Point Arithmetic.
4803 *----------------------------------------------------------------------------*/
4805 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4809 uint64_t lastBitMask, roundBitsMask;
4812 if (floatx80_invalid_encoding(a)) {
4813 float_raise(float_flag_invalid, status);
4814 return floatx80_default_nan(status);
4816 aExp = extractFloatx80Exp( a );
4817 if ( 0x403E <= aExp ) {
4818 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4819 return propagateFloatx80NaN(a, a, status);
4823 if ( aExp < 0x3FFF ) {
4825 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4828 status->float_exception_flags |= float_flag_inexact;
4829 aSign = extractFloatx80Sign( a );
4830 switch (status->float_rounding_mode) {
4831 case float_round_nearest_even:
4832 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4835 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4838 case float_round_ties_away:
4839 if (aExp == 0x3FFE) {
4840 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4843 case float_round_down:
4846 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4847 : packFloatx80( 0, 0, 0 );
4848 case float_round_up:
4850 aSign ? packFloatx80( 1, 0, 0 )
4851 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4853 return packFloatx80( aSign, 0, 0 );
4856 lastBitMask <<= 0x403E - aExp;
4857 roundBitsMask = lastBitMask - 1;
4859 switch (status->float_rounding_mode) {
4860 case float_round_nearest_even:
4861 z.low += lastBitMask>>1;
4862 if ((z.low & roundBitsMask) == 0) {
4863 z.low &= ~lastBitMask;
4866 case float_round_ties_away:
4867 z.low += lastBitMask >> 1;
4869 case float_round_to_zero:
4871 case float_round_up:
4872 if (!extractFloatx80Sign(z)) {
4873 z.low += roundBitsMask;
4876 case float_round_down:
4877 if (extractFloatx80Sign(z)) {
4878 z.low += roundBitsMask;
4884 z.low &= ~ roundBitsMask;
4887 z.low = LIT64( 0x8000000000000000 );
4889 if (z.low != a.low) {
4890 status->float_exception_flags |= float_flag_inexact;
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of adding the absolute values of the extended double-
4898 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4899 | negated before being returned. `zSign' is ignored if the result is a NaN.
4900 | The addition is performed according to the IEC/IEEE Standard for Binary
4901 | Floating-Point Arithmetic.
4902 *----------------------------------------------------------------------------*/
4904 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4905 float_status *status)
4907 int32_t aExp, bExp, zExp;
4908 uint64_t aSig, bSig, zSig0, zSig1;
4911 aSig = extractFloatx80Frac( a );
4912 aExp = extractFloatx80Exp( a );
4913 bSig = extractFloatx80Frac( b );
4914 bExp = extractFloatx80Exp( b );
4915 expDiff = aExp - bExp;
4916 if ( 0 < expDiff ) {
4917 if ( aExp == 0x7FFF ) {
4918 if ((uint64_t)(aSig << 1)) {
4919 return propagateFloatx80NaN(a, b, status);
4923 if ( bExp == 0 ) --expDiff;
4924 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4927 else if ( expDiff < 0 ) {
4928 if ( bExp == 0x7FFF ) {
4929 if ((uint64_t)(bSig << 1)) {
4930 return propagateFloatx80NaN(a, b, status);
4932 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4934 if ( aExp == 0 ) ++expDiff;
4935 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4939 if ( aExp == 0x7FFF ) {
4940 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4941 return propagateFloatx80NaN(a, b, status);
4946 zSig0 = aSig + bSig;
4948 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4954 zSig0 = aSig + bSig;
4955 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4957 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4958 zSig0 |= LIT64( 0x8000000000000000 );
4961 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4962 zSign, zExp, zSig0, zSig1, status);
4965 /*----------------------------------------------------------------------------
4966 | Returns the result of subtracting the absolute values of the extended
4967 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4968 | difference is negated before being returned. `zSign' is ignored if the
4969 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4970 | Standard for Binary Floating-Point Arithmetic.
4971 *----------------------------------------------------------------------------*/
4973 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4974 float_status *status)
4976 int32_t aExp, bExp, zExp;
4977 uint64_t aSig, bSig, zSig0, zSig1;
4980 aSig = extractFloatx80Frac( a );
4981 aExp = extractFloatx80Exp( a );
4982 bSig = extractFloatx80Frac( b );
4983 bExp = extractFloatx80Exp( b );
4984 expDiff = aExp - bExp;
4985 if ( 0 < expDiff ) goto aExpBigger;
4986 if ( expDiff < 0 ) goto bExpBigger;
4987 if ( aExp == 0x7FFF ) {
4988 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4989 return propagateFloatx80NaN(a, b, status);
4991 float_raise(float_flag_invalid, status);
4992 return floatx80_default_nan(status);
4999 if ( bSig < aSig ) goto aBigger;
5000 if ( aSig < bSig ) goto bBigger;
5001 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5003 if ( bExp == 0x7FFF ) {
5004 if ((uint64_t)(bSig << 1)) {
5005 return propagateFloatx80NaN(a, b, status);
5007 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5009 if ( aExp == 0 ) ++expDiff;
5010 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5012 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5015 goto normalizeRoundAndPack;
5017 if ( aExp == 0x7FFF ) {
5018 if ((uint64_t)(aSig << 1)) {
5019 return propagateFloatx80NaN(a, b, status);
5023 if ( bExp == 0 ) --expDiff;
5024 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5026 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5028 normalizeRoundAndPack:
5029 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5030 zSign, zExp, zSig0, zSig1, status);
5033 /*----------------------------------------------------------------------------
5034 | Returns the result of adding the extended double-precision floating-point
5035 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5036 | Standard for Binary Floating-Point Arithmetic.
5037 *----------------------------------------------------------------------------*/
5039 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5043 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5044 float_raise(float_flag_invalid, status);
5045 return floatx80_default_nan(status);
5047 aSign = extractFloatx80Sign( a );
5048 bSign = extractFloatx80Sign( b );
5049 if ( aSign == bSign ) {
5050 return addFloatx80Sigs(a, b, aSign, status);
5053 return subFloatx80Sigs(a, b, aSign, status);
5058 /*----------------------------------------------------------------------------
5059 | Returns the result of subtracting the extended double-precision floating-
5060 | point values `a' and `b'. The operation is performed according to the
5061 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5064 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5068 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5069 float_raise(float_flag_invalid, status);
5070 return floatx80_default_nan(status);
5072 aSign = extractFloatx80Sign( a );
5073 bSign = extractFloatx80Sign( b );
5074 if ( aSign == bSign ) {
5075 return subFloatx80Sigs(a, b, aSign, status);
5078 return addFloatx80Sigs(a, b, aSign, status);
5083 /*----------------------------------------------------------------------------
5084 | Returns the result of multiplying the extended double-precision floating-
5085 | point values `a' and `b'. The operation is performed according to the
5086 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5087 *----------------------------------------------------------------------------*/
5089 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5091 flag aSign, bSign, zSign;
5092 int32_t aExp, bExp, zExp;
5093 uint64_t aSig, bSig, zSig0, zSig1;
5095 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5096 float_raise(float_flag_invalid, status);
5097 return floatx80_default_nan(status);
5099 aSig = extractFloatx80Frac( a );
5100 aExp = extractFloatx80Exp( a );
5101 aSign = extractFloatx80Sign( a );
5102 bSig = extractFloatx80Frac( b );
5103 bExp = extractFloatx80Exp( b );
5104 bSign = extractFloatx80Sign( b );
5105 zSign = aSign ^ bSign;
5106 if ( aExp == 0x7FFF ) {
5107 if ( (uint64_t) ( aSig<<1 )
5108 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5109 return propagateFloatx80NaN(a, b, status);
5111 if ( ( bExp | bSig ) == 0 ) goto invalid;
5112 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5114 if ( bExp == 0x7FFF ) {
5115 if ((uint64_t)(bSig << 1)) {
5116 return propagateFloatx80NaN(a, b, status);
5118 if ( ( aExp | aSig ) == 0 ) {
5120 float_raise(float_flag_invalid, status);
5121 return floatx80_default_nan(status);
5123 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5126 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5127 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5130 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5131 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5133 zExp = aExp + bExp - 0x3FFE;
5134 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5135 if ( 0 < (int64_t) zSig0 ) {
5136 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5139 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5140 zSign, zExp, zSig0, zSig1, status);
5143 /*----------------------------------------------------------------------------
5144 | Returns the result of dividing the extended double-precision floating-point
5145 | value `a' by the corresponding value `b'. The operation is performed
5146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5147 *----------------------------------------------------------------------------*/
5149 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5151 flag aSign, bSign, zSign;
5152 int32_t aExp, bExp, zExp;
5153 uint64_t aSig, bSig, zSig0, zSig1;
5154 uint64_t rem0, rem1, rem2, term0, term1, term2;
5156 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5157 float_raise(float_flag_invalid, status);
5158 return floatx80_default_nan(status);
5160 aSig = extractFloatx80Frac( a );
5161 aExp = extractFloatx80Exp( a );
5162 aSign = extractFloatx80Sign( a );
5163 bSig = extractFloatx80Frac( b );
5164 bExp = extractFloatx80Exp( b );
5165 bSign = extractFloatx80Sign( b );
5166 zSign = aSign ^ bSign;
5167 if ( aExp == 0x7FFF ) {
5168 if ((uint64_t)(aSig << 1)) {
5169 return propagateFloatx80NaN(a, b, status);
5171 if ( bExp == 0x7FFF ) {
5172 if ((uint64_t)(bSig << 1)) {
5173 return propagateFloatx80NaN(a, b, status);
5177 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5179 if ( bExp == 0x7FFF ) {
5180 if ((uint64_t)(bSig << 1)) {
5181 return propagateFloatx80NaN(a, b, status);
5183 return packFloatx80( zSign, 0, 0 );
5187 if ( ( aExp | aSig ) == 0 ) {
5189 float_raise(float_flag_invalid, status);
5190 return floatx80_default_nan(status);
5192 float_raise(float_flag_divbyzero, status);
5193 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5195 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5198 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5199 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5201 zExp = aExp - bExp + 0x3FFE;
5203 if ( bSig <= aSig ) {
5204 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5207 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5208 mul64To128( bSig, zSig0, &term0, &term1 );
5209 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5210 while ( (int64_t) rem0 < 0 ) {
5212 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5214 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5215 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5216 mul64To128( bSig, zSig1, &term1, &term2 );
5217 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5218 while ( (int64_t) rem1 < 0 ) {
5220 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5222 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5224 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5225 zSign, zExp, zSig0, zSig1, status);
5228 /*----------------------------------------------------------------------------
5229 | Returns the remainder of the extended double-precision floating-point value
5230 | `a' with respect to the corresponding value `b'. The operation is performed
5231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5232 *----------------------------------------------------------------------------*/
5234 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5237 int32_t aExp, bExp, expDiff;
5238 uint64_t aSig0, aSig1, bSig;
5239 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5241 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5242 float_raise(float_flag_invalid, status);
5243 return floatx80_default_nan(status);
5245 aSig0 = extractFloatx80Frac( a );
5246 aExp = extractFloatx80Exp( a );
5247 aSign = extractFloatx80Sign( a );
5248 bSig = extractFloatx80Frac( b );
5249 bExp = extractFloatx80Exp( b );
5250 if ( aExp == 0x7FFF ) {
5251 if ( (uint64_t) ( aSig0<<1 )
5252 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5253 return propagateFloatx80NaN(a, b, status);
5257 if ( bExp == 0x7FFF ) {
5258 if ((uint64_t)(bSig << 1)) {
5259 return propagateFloatx80NaN(a, b, status);
5266 float_raise(float_flag_invalid, status);
5267 return floatx80_default_nan(status);
5269 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5272 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5273 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5275 bSig |= LIT64( 0x8000000000000000 );
5277 expDiff = aExp - bExp;
5279 if ( expDiff < 0 ) {
5280 if ( expDiff < -1 ) return a;
5281 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5284 q = ( bSig <= aSig0 );
5285 if ( q ) aSig0 -= bSig;
5287 while ( 0 < expDiff ) {
5288 q = estimateDiv128To64( aSig0, aSig1, bSig );
5289 q = ( 2 < q ) ? q - 2 : 0;
5290 mul64To128( bSig, q, &term0, &term1 );
5291 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5292 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5296 if ( 0 < expDiff ) {
5297 q = estimateDiv128To64( aSig0, aSig1, bSig );
5298 q = ( 2 < q ) ? q - 2 : 0;
5300 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5301 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5302 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5303 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5305 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5312 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5313 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5314 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5317 aSig0 = alternateASig0;
5318 aSig1 = alternateASig1;
5322 normalizeRoundAndPackFloatx80(
5323 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5327 /*----------------------------------------------------------------------------
5328 | Returns the square root of the extended double-precision floating-point
5329 | value `a'. The operation is performed according to the IEC/IEEE Standard
5330 | for Binary Floating-Point Arithmetic.
5331 *----------------------------------------------------------------------------*/
5333 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5337 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5338 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5340 if (floatx80_invalid_encoding(a)) {
5341 float_raise(float_flag_invalid, status);
5342 return floatx80_default_nan(status);
5344 aSig0 = extractFloatx80Frac( a );
5345 aExp = extractFloatx80Exp( a );
5346 aSign = extractFloatx80Sign( a );
5347 if ( aExp == 0x7FFF ) {
5348 if ((uint64_t)(aSig0 << 1)) {
5349 return propagateFloatx80NaN(a, a, status);
5351 if ( ! aSign ) return a;
5355 if ( ( aExp | aSig0 ) == 0 ) return a;
5357 float_raise(float_flag_invalid, status);
5358 return floatx80_default_nan(status);
5361 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5362 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5364 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5365 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5366 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5367 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5368 doubleZSig0 = zSig0<<1;
5369 mul64To128( zSig0, zSig0, &term0, &term1 );
5370 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5371 while ( (int64_t) rem0 < 0 ) {
5374 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5376 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5377 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5378 if ( zSig1 == 0 ) zSig1 = 1;
5379 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5380 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5381 mul64To128( zSig1, zSig1, &term2, &term3 );
5382 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5383 while ( (int64_t) rem1 < 0 ) {
5385 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5387 term2 |= doubleZSig0;
5388 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5390 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5392 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5393 zSig0 |= doubleZSig0;
5394 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5395 0, zExp, zSig0, zSig1, status);
5398 /*----------------------------------------------------------------------------
5399 | Returns 1 if the extended double-precision floating-point value `a' is equal
5400 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5401 | raised if either operand is a NaN. Otherwise, the comparison is performed
5402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5403 *----------------------------------------------------------------------------*/
5405 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5408 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5409 || (extractFloatx80Exp(a) == 0x7FFF
5410 && (uint64_t) (extractFloatx80Frac(a) << 1))
5411 || (extractFloatx80Exp(b) == 0x7FFF
5412 && (uint64_t) (extractFloatx80Frac(b) << 1))
5414 float_raise(float_flag_invalid, status);
5419 && ( ( a.high == b.high )
5421 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5426 /*----------------------------------------------------------------------------
5427 | Returns 1 if the extended double-precision floating-point value `a' is
5428 | less than or equal to the corresponding value `b', and 0 otherwise. The
5429 | invalid exception is raised if either operand is a NaN. The comparison is
5430 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5432 *----------------------------------------------------------------------------*/
5434 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5438 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5439 || (extractFloatx80Exp(a) == 0x7FFF
5440 && (uint64_t) (extractFloatx80Frac(a) << 1))
5441 || (extractFloatx80Exp(b) == 0x7FFF
5442 && (uint64_t) (extractFloatx80Frac(b) << 1))
5444 float_raise(float_flag_invalid, status);
5447 aSign = extractFloatx80Sign( a );
5448 bSign = extractFloatx80Sign( b );
5449 if ( aSign != bSign ) {
5452 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5456 aSign ? le128( b.high, b.low, a.high, a.low )
5457 : le128( a.high, a.low, b.high, b.low );
5461 /*----------------------------------------------------------------------------
5462 | Returns 1 if the extended double-precision floating-point value `a' is
5463 | less than the corresponding value `b', and 0 otherwise. The invalid
5464 | exception is raised if either operand is a NaN. The comparison is performed
5465 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5466 *----------------------------------------------------------------------------*/
5468 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5472 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5473 || (extractFloatx80Exp(a) == 0x7FFF
5474 && (uint64_t) (extractFloatx80Frac(a) << 1))
5475 || (extractFloatx80Exp(b) == 0x7FFF
5476 && (uint64_t) (extractFloatx80Frac(b) << 1))
5478 float_raise(float_flag_invalid, status);
5481 aSign = extractFloatx80Sign( a );
5482 bSign = extractFloatx80Sign( b );
5483 if ( aSign != bSign ) {
5486 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5490 aSign ? lt128( b.high, b.low, a.high, a.low )
5491 : lt128( a.high, a.low, b.high, b.low );
5495 /*----------------------------------------------------------------------------
5496 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5497 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5498 | either operand is a NaN. The comparison is performed according to the
5499 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5500 *----------------------------------------------------------------------------*/
5501 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5503 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5504 || (extractFloatx80Exp(a) == 0x7FFF
5505 && (uint64_t) (extractFloatx80Frac(a) << 1))
5506 || (extractFloatx80Exp(b) == 0x7FFF
5507 && (uint64_t) (extractFloatx80Frac(b) << 1))
5509 float_raise(float_flag_invalid, status);
5515 /*----------------------------------------------------------------------------
5516 | Returns 1 if the extended double-precision floating-point value `a' is
5517 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5518 | cause an exception. The comparison is performed according to the IEC/IEEE
5519 | Standard for Binary Floating-Point Arithmetic.
5520 *----------------------------------------------------------------------------*/
5522 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5525 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5526 float_raise(float_flag_invalid, status);
5529 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5530 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5531 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5532 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5534 if (floatx80_is_signaling_nan(a, status)
5535 || floatx80_is_signaling_nan(b, status)) {
5536 float_raise(float_flag_invalid, status);
5542 && ( ( a.high == b.high )
5544 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5549 /*----------------------------------------------------------------------------
5550 | Returns 1 if the extended double-precision floating-point value `a' is less
5551 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5552 | do not cause an exception. Otherwise, the comparison is performed according
5553 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5554 *----------------------------------------------------------------------------*/
5556 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5560 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5561 float_raise(float_flag_invalid, status);
5564 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5565 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5566 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5567 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5569 if (floatx80_is_signaling_nan(a, status)
5570 || floatx80_is_signaling_nan(b, status)) {
5571 float_raise(float_flag_invalid, status);
5575 aSign = extractFloatx80Sign( a );
5576 bSign = extractFloatx80Sign( b );
5577 if ( aSign != bSign ) {
5580 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5584 aSign ? le128( b.high, b.low, a.high, a.low )
5585 : le128( a.high, a.low, b.high, b.low );
5589 /*----------------------------------------------------------------------------
5590 | Returns 1 if the extended double-precision floating-point value `a' is less
5591 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5592 | an exception. Otherwise, the comparison is performed according to the
5593 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5594 *----------------------------------------------------------------------------*/
5596 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5600 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5601 float_raise(float_flag_invalid, status);
5604 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5605 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5606 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5607 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5609 if (floatx80_is_signaling_nan(a, status)
5610 || floatx80_is_signaling_nan(b, status)) {
5611 float_raise(float_flag_invalid, status);
5615 aSign = extractFloatx80Sign( a );
5616 bSign = extractFloatx80Sign( b );
5617 if ( aSign != bSign ) {
5620 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5624 aSign ? lt128( b.high, b.low, a.high, a.low )
5625 : lt128( a.high, a.low, b.high, b.low );
5629 /*----------------------------------------------------------------------------
5630 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5631 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5632 | The comparison is performed according to the IEC/IEEE Standard for Binary
5633 | Floating-Point Arithmetic.
5634 *----------------------------------------------------------------------------*/
5635 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5637 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5638 float_raise(float_flag_invalid, status);
5641 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5642 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5643 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5644 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5646 if (floatx80_is_signaling_nan(a, status)
5647 || floatx80_is_signaling_nan(b, status)) {
5648 float_raise(float_flag_invalid, status);
5655 /*----------------------------------------------------------------------------
5656 | Returns the result of converting the quadruple-precision floating-point
5657 | value `a' to the 32-bit two's complement integer format. The conversion
5658 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5659 | Arithmetic---which means in particular that the conversion is rounded
5660 | according to the current rounding mode. If `a' is a NaN, the largest
5661 | positive integer is returned. Otherwise, if the conversion overflows, the
5662 | largest integer with the same sign as `a' is returned.
5663 *----------------------------------------------------------------------------*/
5665 int32_t float128_to_int32(float128 a, float_status *status)
5668 int32_t aExp, shiftCount;
5669 uint64_t aSig0, aSig1;
5671 aSig1 = extractFloat128Frac1( a );
5672 aSig0 = extractFloat128Frac0( a );
5673 aExp = extractFloat128Exp( a );
5674 aSign = extractFloat128Sign( a );
5675 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5676 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5677 aSig0 |= ( aSig1 != 0 );
5678 shiftCount = 0x4028 - aExp;
5679 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5680 return roundAndPackInt32(aSign, aSig0, status);
5684 /*----------------------------------------------------------------------------
5685 | Returns the result of converting the quadruple-precision floating-point
5686 | value `a' to the 32-bit two's complement integer format. The conversion
5687 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5688 | Arithmetic, except that the conversion is always rounded toward zero. If
5689 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5690 | conversion overflows, the largest integer with the same sign as `a' is
5692 *----------------------------------------------------------------------------*/
5694 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5697 int32_t aExp, shiftCount;
5698 uint64_t aSig0, aSig1, savedASig;
5701 aSig1 = extractFloat128Frac1( a );
5702 aSig0 = extractFloat128Frac0( a );
5703 aExp = extractFloat128Exp( a );
5704 aSign = extractFloat128Sign( a );
5705 aSig0 |= ( aSig1 != 0 );
5706 if ( 0x401E < aExp ) {
5707 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5710 else if ( aExp < 0x3FFF ) {
5711 if (aExp || aSig0) {
5712 status->float_exception_flags |= float_flag_inexact;
5716 aSig0 |= LIT64( 0x0001000000000000 );
5717 shiftCount = 0x402F - aExp;
5719 aSig0 >>= shiftCount;
5721 if ( aSign ) z = - z;
5722 if ( ( z < 0 ) ^ aSign ) {
5724 float_raise(float_flag_invalid, status);
5725 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5727 if ( ( aSig0<<shiftCount ) != savedASig ) {
5728 status->float_exception_flags |= float_flag_inexact;
5734 /*----------------------------------------------------------------------------
5735 | Returns the result of converting the quadruple-precision floating-point
5736 | value `a' to the 64-bit two's complement integer format. The conversion
5737 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5738 | Arithmetic---which means in particular that the conversion is rounded
5739 | according to the current rounding mode. If `a' is a NaN, the largest
5740 | positive integer is returned. Otherwise, if the conversion overflows, the
5741 | largest integer with the same sign as `a' is returned.
5742 *----------------------------------------------------------------------------*/
5744 int64_t float128_to_int64(float128 a, float_status *status)
5747 int32_t aExp, shiftCount;
5748 uint64_t aSig0, aSig1;
5750 aSig1 = extractFloat128Frac1( a );
5751 aSig0 = extractFloat128Frac0( a );
5752 aExp = extractFloat128Exp( a );
5753 aSign = extractFloat128Sign( a );
5754 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5755 shiftCount = 0x402F - aExp;
5756 if ( shiftCount <= 0 ) {
5757 if ( 0x403E < aExp ) {
5758 float_raise(float_flag_invalid, status);
5760 || ( ( aExp == 0x7FFF )
5761 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5764 return LIT64( 0x7FFFFFFFFFFFFFFF );
5766 return (int64_t) LIT64( 0x8000000000000000 );
5768 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5771 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5773 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5777 /*----------------------------------------------------------------------------
5778 | Returns the result of converting the quadruple-precision floating-point
5779 | value `a' to the 64-bit two's complement integer format. The conversion
5780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5781 | Arithmetic, except that the conversion is always rounded toward zero.
5782 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5783 | the conversion overflows, the largest integer with the same sign as `a' is
5785 *----------------------------------------------------------------------------*/
5787 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5790 int32_t aExp, shiftCount;
5791 uint64_t aSig0, aSig1;
5794 aSig1 = extractFloat128Frac1( a );
5795 aSig0 = extractFloat128Frac0( a );
5796 aExp = extractFloat128Exp( a );
5797 aSign = extractFloat128Sign( a );
5798 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5799 shiftCount = aExp - 0x402F;
5800 if ( 0 < shiftCount ) {
5801 if ( 0x403E <= aExp ) {
5802 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5803 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5804 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5806 status->float_exception_flags |= float_flag_inexact;
5810 float_raise(float_flag_invalid, status);
5811 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5812 return LIT64( 0x7FFFFFFFFFFFFFFF );
5815 return (int64_t) LIT64( 0x8000000000000000 );
5817 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5818 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5819 status->float_exception_flags |= float_flag_inexact;
5823 if ( aExp < 0x3FFF ) {
5824 if ( aExp | aSig0 | aSig1 ) {
5825 status->float_exception_flags |= float_flag_inexact;
5829 z = aSig0>>( - shiftCount );
5831 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5832 status->float_exception_flags |= float_flag_inexact;
5835 if ( aSign ) z = - z;
5840 /*----------------------------------------------------------------------------
5841 | Returns the result of converting the quadruple-precision floating-point value
5842 | `a' to the 64-bit unsigned integer format. The conversion is
5843 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5844 | Arithmetic---which means in particular that the conversion is rounded
5845 | according to the current rounding mode. If `a' is a NaN, the largest
5846 | positive integer is returned. If the conversion overflows, the
5847 | largest unsigned integer is returned. If 'a' is negative, the value is
5848 | rounded and zero is returned; negative values that do not round to zero
5849 | will raise the inexact exception.
5850 *----------------------------------------------------------------------------*/
5852 uint64_t float128_to_uint64(float128 a, float_status *status)
5857 uint64_t aSig0, aSig1;
5859 aSig0 = extractFloat128Frac0(a);
5860 aSig1 = extractFloat128Frac1(a);
5861 aExp = extractFloat128Exp(a);
5862 aSign = extractFloat128Sign(a);
5863 if (aSign && (aExp > 0x3FFE)) {
5864 float_raise(float_flag_invalid, status);
5865 if (float128_is_any_nan(a)) {
5866 return LIT64(0xFFFFFFFFFFFFFFFF);
5872 aSig0 |= LIT64(0x0001000000000000);
5874 shiftCount = 0x402F - aExp;
5875 if (shiftCount <= 0) {
5876 if (0x403E < aExp) {
5877 float_raise(float_flag_invalid, status);
5878 return LIT64(0xFFFFFFFFFFFFFFFF);
5880 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5882 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5884 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5887 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5890 signed char current_rounding_mode = status->float_rounding_mode;
5892 set_float_rounding_mode(float_round_to_zero, status);
5893 v = float128_to_uint64(a, status);
5894 set_float_rounding_mode(current_rounding_mode, status);
5899 /*----------------------------------------------------------------------------
5900 | Returns the result of converting the quadruple-precision floating-point
5901 | value `a' to the 32-bit unsigned integer format. The conversion
5902 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5903 | Arithmetic except that the conversion is always rounded toward zero.
5904 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5905 | if the conversion overflows, the largest unsigned integer is returned.
5906 | If 'a' is negative, the value is rounded and zero is returned; negative
5907 | values that do not round to zero will raise the inexact exception.
5908 *----------------------------------------------------------------------------*/
5910 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5914 int old_exc_flags = get_float_exception_flags(status);
5916 v = float128_to_uint64_round_to_zero(a, status);
5917 if (v > 0xffffffff) {
5922 set_float_exception_flags(old_exc_flags, status);
5923 float_raise(float_flag_invalid, status);
5927 /*----------------------------------------------------------------------------
5928 | Returns the result of converting the quadruple-precision floating-point
5929 | value `a' to the single-precision floating-point format. The conversion
5930 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5932 *----------------------------------------------------------------------------*/
5934 float32 float128_to_float32(float128 a, float_status *status)
5938 uint64_t aSig0, aSig1;
5941 aSig1 = extractFloat128Frac1( a );
5942 aSig0 = extractFloat128Frac0( a );
5943 aExp = extractFloat128Exp( a );
5944 aSign = extractFloat128Sign( a );
5945 if ( aExp == 0x7FFF ) {
5946 if ( aSig0 | aSig1 ) {
5947 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5949 return packFloat32( aSign, 0xFF, 0 );
5951 aSig0 |= ( aSig1 != 0 );
5952 shift64RightJamming( aSig0, 18, &aSig0 );
5954 if ( aExp || zSig ) {
5958 return roundAndPackFloat32(aSign, aExp, zSig, status);
5962 /*----------------------------------------------------------------------------
5963 | Returns the result of converting the quadruple-precision floating-point
5964 | value `a' to the double-precision floating-point format. The conversion
5965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5967 *----------------------------------------------------------------------------*/
5969 float64 float128_to_float64(float128 a, float_status *status)
5973 uint64_t aSig0, aSig1;
5975 aSig1 = extractFloat128Frac1( a );
5976 aSig0 = extractFloat128Frac0( a );
5977 aExp = extractFloat128Exp( a );
5978 aSign = extractFloat128Sign( a );
5979 if ( aExp == 0x7FFF ) {
5980 if ( aSig0 | aSig1 ) {
5981 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5983 return packFloat64( aSign, 0x7FF, 0 );
5985 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5986 aSig0 |= ( aSig1 != 0 );
5987 if ( aExp || aSig0 ) {
5988 aSig0 |= LIT64( 0x4000000000000000 );
5991 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of converting the quadruple-precision floating-point
5997 | value `a' to the extended double-precision floating-point format. The
5998 | conversion is performed according to the IEC/IEEE Standard for Binary
5999 | Floating-Point Arithmetic.
6000 *----------------------------------------------------------------------------*/
6002 floatx80 float128_to_floatx80(float128 a, float_status *status)
6006 uint64_t aSig0, aSig1;
6008 aSig1 = extractFloat128Frac1( a );
6009 aSig0 = extractFloat128Frac0( a );
6010 aExp = extractFloat128Exp( a );
6011 aSign = extractFloat128Sign( a );
6012 if ( aExp == 0x7FFF ) {
6013 if ( aSig0 | aSig1 ) {
6014 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6016 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6019 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6020 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6023 aSig0 |= LIT64( 0x0001000000000000 );
6025 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6026 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6030 /*----------------------------------------------------------------------------
6031 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6032 | returns the result as a quadruple-precision floating-point value. The
6033 | operation is performed according to the IEC/IEEE Standard for Binary
6034 | Floating-Point Arithmetic.
6035 *----------------------------------------------------------------------------*/
6037 float128 float128_round_to_int(float128 a, float_status *status)
6041 uint64_t lastBitMask, roundBitsMask;
6044 aExp = extractFloat128Exp( a );
6045 if ( 0x402F <= aExp ) {
6046 if ( 0x406F <= aExp ) {
6047 if ( ( aExp == 0x7FFF )
6048 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6050 return propagateFloat128NaN(a, a, status);
6055 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6056 roundBitsMask = lastBitMask - 1;
6058 switch (status->float_rounding_mode) {
6059 case float_round_nearest_even:
6060 if ( lastBitMask ) {
6061 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6062 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6065 if ( (int64_t) z.low < 0 ) {
6067 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6071 case float_round_ties_away:
6073 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6075 if ((int64_t) z.low < 0) {
6080 case float_round_to_zero:
6082 case float_round_up:
6083 if (!extractFloat128Sign(z)) {
6084 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6087 case float_round_down:
6088 if (extractFloat128Sign(z)) {
6089 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6095 z.low &= ~ roundBitsMask;
6098 if ( aExp < 0x3FFF ) {
6099 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6100 status->float_exception_flags |= float_flag_inexact;
6101 aSign = extractFloat128Sign( a );
6102 switch (status->float_rounding_mode) {
6103 case float_round_nearest_even:
6104 if ( ( aExp == 0x3FFE )
6105 && ( extractFloat128Frac0( a )
6106 | extractFloat128Frac1( a ) )
6108 return packFloat128( aSign, 0x3FFF, 0, 0 );
6111 case float_round_ties_away:
6112 if (aExp == 0x3FFE) {
6113 return packFloat128(aSign, 0x3FFF, 0, 0);
6116 case float_round_down:
6118 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6119 : packFloat128( 0, 0, 0, 0 );
6120 case float_round_up:
6122 aSign ? packFloat128( 1, 0, 0, 0 )
6123 : packFloat128( 0, 0x3FFF, 0, 0 );
6125 return packFloat128( aSign, 0, 0, 0 );
6128 lastBitMask <<= 0x402F - aExp;
6129 roundBitsMask = lastBitMask - 1;
6132 switch (status->float_rounding_mode) {
6133 case float_round_nearest_even:
6134 z.high += lastBitMask>>1;
6135 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6136 z.high &= ~ lastBitMask;
6139 case float_round_ties_away:
6140 z.high += lastBitMask>>1;
6142 case float_round_to_zero:
6144 case float_round_up:
6145 if (!extractFloat128Sign(z)) {
6146 z.high |= ( a.low != 0 );
6147 z.high += roundBitsMask;
6150 case float_round_down:
6151 if (extractFloat128Sign(z)) {
6152 z.high |= (a.low != 0);
6153 z.high += roundBitsMask;
6159 z.high &= ~ roundBitsMask;
6161 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6162 status->float_exception_flags |= float_flag_inexact;
6168 /*----------------------------------------------------------------------------
6169 | Returns the result of adding the absolute values of the quadruple-precision
6170 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6171 | before being returned. `zSign' is ignored if the result is a NaN.
6172 | The addition is performed according to the IEC/IEEE Standard for Binary
6173 | Floating-Point Arithmetic.
6174 *----------------------------------------------------------------------------*/
6176 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6177 float_status *status)
6179 int32_t aExp, bExp, zExp;
6180 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6183 aSig1 = extractFloat128Frac1( a );
6184 aSig0 = extractFloat128Frac0( a );
6185 aExp = extractFloat128Exp( a );
6186 bSig1 = extractFloat128Frac1( b );
6187 bSig0 = extractFloat128Frac0( b );
6188 bExp = extractFloat128Exp( b );
6189 expDiff = aExp - bExp;
6190 if ( 0 < expDiff ) {
6191 if ( aExp == 0x7FFF ) {
6192 if (aSig0 | aSig1) {
6193 return propagateFloat128NaN(a, b, status);
6201 bSig0 |= LIT64( 0x0001000000000000 );
6203 shift128ExtraRightJamming(
6204 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6207 else if ( expDiff < 0 ) {
6208 if ( bExp == 0x7FFF ) {
6209 if (bSig0 | bSig1) {
6210 return propagateFloat128NaN(a, b, status);
6212 return packFloat128( zSign, 0x7FFF, 0, 0 );
6218 aSig0 |= LIT64( 0x0001000000000000 );
6220 shift128ExtraRightJamming(
6221 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6225 if ( aExp == 0x7FFF ) {
6226 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6227 return propagateFloat128NaN(a, b, status);
6231 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6233 if (status->flush_to_zero) {
6234 if (zSig0 | zSig1) {
6235 float_raise(float_flag_output_denormal, status);
6237 return packFloat128(zSign, 0, 0, 0);
6239 return packFloat128( zSign, 0, zSig0, zSig1 );
6242 zSig0 |= LIT64( 0x0002000000000000 );
6246 aSig0 |= LIT64( 0x0001000000000000 );
6247 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6249 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6252 shift128ExtraRightJamming(
6253 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6255 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6259 /*----------------------------------------------------------------------------
6260 | Returns the result of subtracting the absolute values of the quadruple-
6261 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6262 | difference is negated before being returned. `zSign' is ignored if the
6263 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6264 | Standard for Binary Floating-Point Arithmetic.
6265 *----------------------------------------------------------------------------*/
6267 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6268 float_status *status)
6270 int32_t aExp, bExp, zExp;
6271 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6274 aSig1 = extractFloat128Frac1( a );
6275 aSig0 = extractFloat128Frac0( a );
6276 aExp = extractFloat128Exp( a );
6277 bSig1 = extractFloat128Frac1( b );
6278 bSig0 = extractFloat128Frac0( b );
6279 bExp = extractFloat128Exp( b );
6280 expDiff = aExp - bExp;
6281 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6282 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6283 if ( 0 < expDiff ) goto aExpBigger;
6284 if ( expDiff < 0 ) goto bExpBigger;
6285 if ( aExp == 0x7FFF ) {
6286 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6287 return propagateFloat128NaN(a, b, status);
6289 float_raise(float_flag_invalid, status);
6290 return float128_default_nan(status);
6296 if ( bSig0 < aSig0 ) goto aBigger;
6297 if ( aSig0 < bSig0 ) goto bBigger;
6298 if ( bSig1 < aSig1 ) goto aBigger;
6299 if ( aSig1 < bSig1 ) goto bBigger;
6300 return packFloat128(status->float_rounding_mode == float_round_down,
6303 if ( bExp == 0x7FFF ) {
6304 if (bSig0 | bSig1) {
6305 return propagateFloat128NaN(a, b, status);
6307 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6313 aSig0 |= LIT64( 0x4000000000000000 );
6315 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6316 bSig0 |= LIT64( 0x4000000000000000 );
6318 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6321 goto normalizeRoundAndPack;
6323 if ( aExp == 0x7FFF ) {
6324 if (aSig0 | aSig1) {
6325 return propagateFloat128NaN(a, b, status);
6333 bSig0 |= LIT64( 0x4000000000000000 );
6335 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6336 aSig0 |= LIT64( 0x4000000000000000 );
6338 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6340 normalizeRoundAndPack:
6342 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6347 /*----------------------------------------------------------------------------
6348 | Returns the result of adding the quadruple-precision floating-point values
6349 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6350 | for Binary Floating-Point Arithmetic.
6351 *----------------------------------------------------------------------------*/
6353 float128 float128_add(float128 a, float128 b, float_status *status)
6357 aSign = extractFloat128Sign( a );
6358 bSign = extractFloat128Sign( b );
6359 if ( aSign == bSign ) {
6360 return addFloat128Sigs(a, b, aSign, status);
6363 return subFloat128Sigs(a, b, aSign, status);
6368 /*----------------------------------------------------------------------------
6369 | Returns the result of subtracting the quadruple-precision floating-point
6370 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6371 | Standard for Binary Floating-Point Arithmetic.
6372 *----------------------------------------------------------------------------*/
6374 float128 float128_sub(float128 a, float128 b, float_status *status)
6378 aSign = extractFloat128Sign( a );
6379 bSign = extractFloat128Sign( b );
6380 if ( aSign == bSign ) {
6381 return subFloat128Sigs(a, b, aSign, status);
6384 return addFloat128Sigs(a, b, aSign, status);
6389 /*----------------------------------------------------------------------------
6390 | Returns the result of multiplying the quadruple-precision floating-point
6391 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6392 | Standard for Binary Floating-Point Arithmetic.
6393 *----------------------------------------------------------------------------*/
6395 float128 float128_mul(float128 a, float128 b, float_status *status)
6397 flag aSign, bSign, zSign;
6398 int32_t aExp, bExp, zExp;
6399 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6401 aSig1 = extractFloat128Frac1( a );
6402 aSig0 = extractFloat128Frac0( a );
6403 aExp = extractFloat128Exp( a );
6404 aSign = extractFloat128Sign( a );
6405 bSig1 = extractFloat128Frac1( b );
6406 bSig0 = extractFloat128Frac0( b );
6407 bExp = extractFloat128Exp( b );
6408 bSign = extractFloat128Sign( b );
6409 zSign = aSign ^ bSign;
6410 if ( aExp == 0x7FFF ) {
6411 if ( ( aSig0 | aSig1 )
6412 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6413 return propagateFloat128NaN(a, b, status);
6415 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6416 return packFloat128( zSign, 0x7FFF, 0, 0 );
6418 if ( bExp == 0x7FFF ) {
6419 if (bSig0 | bSig1) {
6420 return propagateFloat128NaN(a, b, status);
6422 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6424 float_raise(float_flag_invalid, status);
6425 return float128_default_nan(status);
6427 return packFloat128( zSign, 0x7FFF, 0, 0 );
6430 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6431 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6434 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6435 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6437 zExp = aExp + bExp - 0x4000;
6438 aSig0 |= LIT64( 0x0001000000000000 );
6439 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6440 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6441 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6442 zSig2 |= ( zSig3 != 0 );
6443 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6444 shift128ExtraRightJamming(
6445 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6448 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6452 /*----------------------------------------------------------------------------
6453 | Returns the result of dividing the quadruple-precision floating-point value
6454 | `a' by the corresponding value `b'. The operation is performed according to
6455 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6456 *----------------------------------------------------------------------------*/
6458 float128 float128_div(float128 a, float128 b, float_status *status)
6460 flag aSign, bSign, zSign;
6461 int32_t aExp, bExp, zExp;
6462 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6463 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6465 aSig1 = extractFloat128Frac1( a );
6466 aSig0 = extractFloat128Frac0( a );
6467 aExp = extractFloat128Exp( a );
6468 aSign = extractFloat128Sign( a );
6469 bSig1 = extractFloat128Frac1( b );
6470 bSig0 = extractFloat128Frac0( b );
6471 bExp = extractFloat128Exp( b );
6472 bSign = extractFloat128Sign( b );
6473 zSign = aSign ^ bSign;
6474 if ( aExp == 0x7FFF ) {
6475 if (aSig0 | aSig1) {
6476 return propagateFloat128NaN(a, b, status);
6478 if ( bExp == 0x7FFF ) {
6479 if (bSig0 | bSig1) {
6480 return propagateFloat128NaN(a, b, status);
6484 return packFloat128( zSign, 0x7FFF, 0, 0 );
6486 if ( bExp == 0x7FFF ) {
6487 if (bSig0 | bSig1) {
6488 return propagateFloat128NaN(a, b, status);
6490 return packFloat128( zSign, 0, 0, 0 );
6493 if ( ( bSig0 | bSig1 ) == 0 ) {
6494 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6496 float_raise(float_flag_invalid, status);
6497 return float128_default_nan(status);
6499 float_raise(float_flag_divbyzero, status);
6500 return packFloat128( zSign, 0x7FFF, 0, 0 );
6502 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6505 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6506 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6508 zExp = aExp - bExp + 0x3FFD;
6510 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6512 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6513 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6514 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6517 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6518 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6519 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6520 while ( (int64_t) rem0 < 0 ) {
6522 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6524 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6525 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6526 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6527 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6528 while ( (int64_t) rem1 < 0 ) {
6530 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6532 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6534 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6535 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6539 /*----------------------------------------------------------------------------
6540 | Returns the remainder of the quadruple-precision floating-point value `a'
6541 | with respect to the corresponding value `b'. The operation is performed
6542 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6543 *----------------------------------------------------------------------------*/
6545 float128 float128_rem(float128 a, float128 b, float_status *status)
6548 int32_t aExp, bExp, expDiff;
6549 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6550 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6553 aSig1 = extractFloat128Frac1( a );
6554 aSig0 = extractFloat128Frac0( a );
6555 aExp = extractFloat128Exp( a );
6556 aSign = extractFloat128Sign( a );
6557 bSig1 = extractFloat128Frac1( b );
6558 bSig0 = extractFloat128Frac0( b );
6559 bExp = extractFloat128Exp( b );
6560 if ( aExp == 0x7FFF ) {
6561 if ( ( aSig0 | aSig1 )
6562 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6563 return propagateFloat128NaN(a, b, status);
6567 if ( bExp == 0x7FFF ) {
6568 if (bSig0 | bSig1) {
6569 return propagateFloat128NaN(a, b, status);
6574 if ( ( bSig0 | bSig1 ) == 0 ) {
6576 float_raise(float_flag_invalid, status);
6577 return float128_default_nan(status);
6579 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6582 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6583 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6585 expDiff = aExp - bExp;
6586 if ( expDiff < -1 ) return a;
6588 aSig0 | LIT64( 0x0001000000000000 ),
6590 15 - ( expDiff < 0 ),
6595 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6596 q = le128( bSig0, bSig1, aSig0, aSig1 );
6597 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6599 while ( 0 < expDiff ) {
6600 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6601 q = ( 4 < q ) ? q - 4 : 0;
6602 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6603 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6604 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6605 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6608 if ( -64 < expDiff ) {
6609 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6610 q = ( 4 < q ) ? q - 4 : 0;
6612 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6614 if ( expDiff < 0 ) {
6615 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6618 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6620 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6621 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6624 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6625 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6628 alternateASig0 = aSig0;
6629 alternateASig1 = aSig1;
6631 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6632 } while ( 0 <= (int64_t) aSig0 );
6634 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6635 if ( ( sigMean0 < 0 )
6636 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6637 aSig0 = alternateASig0;
6638 aSig1 = alternateASig1;
6640 zSign = ( (int64_t) aSig0 < 0 );
6641 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6642 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6646 /*----------------------------------------------------------------------------
6647 | Returns the square root of the quadruple-precision floating-point value `a'.
6648 | The operation is performed according to the IEC/IEEE Standard for Binary
6649 | Floating-Point Arithmetic.
6650 *----------------------------------------------------------------------------*/
6652 float128 float128_sqrt(float128 a, float_status *status)
6656 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6657 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6659 aSig1 = extractFloat128Frac1( a );
6660 aSig0 = extractFloat128Frac0( a );
6661 aExp = extractFloat128Exp( a );
6662 aSign = extractFloat128Sign( a );
6663 if ( aExp == 0x7FFF ) {
6664 if (aSig0 | aSig1) {
6665 return propagateFloat128NaN(a, a, status);
6667 if ( ! aSign ) return a;
6671 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6673 float_raise(float_flag_invalid, status);
6674 return float128_default_nan(status);
6677 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6678 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6680 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6681 aSig0 |= LIT64( 0x0001000000000000 );
6682 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6683 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6684 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6685 doubleZSig0 = zSig0<<1;
6686 mul64To128( zSig0, zSig0, &term0, &term1 );
6687 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6688 while ( (int64_t) rem0 < 0 ) {
6691 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6693 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6694 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6695 if ( zSig1 == 0 ) zSig1 = 1;
6696 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6697 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6698 mul64To128( zSig1, zSig1, &term2, &term3 );
6699 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6700 while ( (int64_t) rem1 < 0 ) {
6702 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6704 term2 |= doubleZSig0;
6705 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6707 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6709 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6710 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6714 /*----------------------------------------------------------------------------
6715 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6716 | the corresponding value `b', and 0 otherwise. The invalid exception is
6717 | raised if either operand is a NaN. Otherwise, the comparison is performed
6718 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6719 *----------------------------------------------------------------------------*/
6721 int float128_eq(float128 a, float128 b, float_status *status)
6724 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6725 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6726 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6727 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6729 float_raise(float_flag_invalid, status);
6734 && ( ( a.high == b.high )
6736 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6741 /*----------------------------------------------------------------------------
6742 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6743 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6744 | exception is raised if either operand is a NaN. The comparison is performed
6745 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6746 *----------------------------------------------------------------------------*/
6748 int float128_le(float128 a, float128 b, float_status *status)
6752 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6753 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6754 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6755 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6757 float_raise(float_flag_invalid, status);
6760 aSign = extractFloat128Sign( a );
6761 bSign = extractFloat128Sign( b );
6762 if ( aSign != bSign ) {
6765 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6769 aSign ? le128( b.high, b.low, a.high, a.low )
6770 : le128( a.high, a.low, b.high, b.low );
6774 /*----------------------------------------------------------------------------
6775 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6776 | the corresponding value `b', and 0 otherwise. The invalid exception is
6777 | raised if either operand is a NaN. The comparison is performed according
6778 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6779 *----------------------------------------------------------------------------*/
6781 int float128_lt(float128 a, float128 b, float_status *status)
6785 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6786 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6787 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6788 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6790 float_raise(float_flag_invalid, status);
6793 aSign = extractFloat128Sign( a );
6794 bSign = extractFloat128Sign( b );
6795 if ( aSign != bSign ) {
6798 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6802 aSign ? lt128( b.high, b.low, a.high, a.low )
6803 : lt128( a.high, a.low, b.high, b.low );
6807 /*----------------------------------------------------------------------------
6808 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6809 | be compared, and 0 otherwise. The invalid exception is raised if either
6810 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6811 | Standard for Binary Floating-Point Arithmetic.
6812 *----------------------------------------------------------------------------*/
6814 int float128_unordered(float128 a, float128 b, float_status *status)
6816 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6817 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6818 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6819 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6821 float_raise(float_flag_invalid, status);
6827 /*----------------------------------------------------------------------------
6828 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6829 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6830 | exception. The comparison is performed according to the IEC/IEEE Standard
6831 | for Binary Floating-Point Arithmetic.
6832 *----------------------------------------------------------------------------*/
6834 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6837 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6838 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6839 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6840 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6842 if (float128_is_signaling_nan(a, status)
6843 || float128_is_signaling_nan(b, status)) {
6844 float_raise(float_flag_invalid, status);
6850 && ( ( a.high == b.high )
6852 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6857 /*----------------------------------------------------------------------------
6858 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6859 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6860 | cause an exception. Otherwise, the comparison is performed according to the
6861 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6862 *----------------------------------------------------------------------------*/
6864 int float128_le_quiet(float128 a, float128 b, float_status *status)
6868 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6869 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6870 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6871 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6873 if (float128_is_signaling_nan(a, status)
6874 || float128_is_signaling_nan(b, status)) {
6875 float_raise(float_flag_invalid, status);
6879 aSign = extractFloat128Sign( a );
6880 bSign = extractFloat128Sign( b );
6881 if ( aSign != bSign ) {
6884 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6888 aSign ? le128( b.high, b.low, a.high, a.low )
6889 : le128( a.high, a.low, b.high, b.low );
6893 /*----------------------------------------------------------------------------
6894 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6895 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6896 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6897 | Standard for Binary Floating-Point Arithmetic.
6898 *----------------------------------------------------------------------------*/
6900 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6904 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6907 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6909 if (float128_is_signaling_nan(a, status)
6910 || float128_is_signaling_nan(b, status)) {
6911 float_raise(float_flag_invalid, status);
6915 aSign = extractFloat128Sign( a );
6916 bSign = extractFloat128Sign( b );
6917 if ( aSign != bSign ) {
6920 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6924 aSign ? lt128( b.high, b.low, a.high, a.low )
6925 : lt128( a.high, a.low, b.high, b.low );
6929 /*----------------------------------------------------------------------------
6930 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6931 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6932 | comparison is performed according to the IEC/IEEE Standard for Binary
6933 | Floating-Point Arithmetic.
6934 *----------------------------------------------------------------------------*/
6936 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6938 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6939 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6940 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6941 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6943 if (float128_is_signaling_nan(a, status)
6944 || float128_is_signaling_nan(b, status)) {
6945 float_raise(float_flag_invalid, status);
6952 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6953 int is_quiet, float_status *status)
6957 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6958 float_raise(float_flag_invalid, status);
6959 return float_relation_unordered;
6961 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6962 ( extractFloatx80Frac( a )<<1 ) ) ||
6963 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6964 ( extractFloatx80Frac( b )<<1 ) )) {
6966 floatx80_is_signaling_nan(a, status) ||
6967 floatx80_is_signaling_nan(b, status)) {
6968 float_raise(float_flag_invalid, status);
6970 return float_relation_unordered;
6972 aSign = extractFloatx80Sign( a );
6973 bSign = extractFloatx80Sign( b );
6974 if ( aSign != bSign ) {
6976 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6977 ( ( a.low | b.low ) == 0 ) ) {
6979 return float_relation_equal;
6981 return 1 - (2 * aSign);
6984 if (a.low == b.low && a.high == b.high) {
6985 return float_relation_equal;
6987 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6992 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6994 return floatx80_compare_internal(a, b, 0, status);
6997 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6999 return floatx80_compare_internal(a, b, 1, status);
7002 static inline int float128_compare_internal(float128 a, float128 b,
7003 int is_quiet, float_status *status)
7007 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7008 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7009 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7010 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7012 float128_is_signaling_nan(a, status) ||
7013 float128_is_signaling_nan(b, status)) {
7014 float_raise(float_flag_invalid, status);
7016 return float_relation_unordered;
7018 aSign = extractFloat128Sign( a );
7019 bSign = extractFloat128Sign( b );
7020 if ( aSign != bSign ) {
7021 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7023 return float_relation_equal;
7025 return 1 - (2 * aSign);
7028 if (a.low == b.low && a.high == b.high) {
7029 return float_relation_equal;
7031 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7036 int float128_compare(float128 a, float128 b, float_status *status)
7038 return float128_compare_internal(a, b, 0, status);
7041 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7043 return float128_compare_internal(a, b, 1, status);
7046 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7052 if (floatx80_invalid_encoding(a)) {
7053 float_raise(float_flag_invalid, status);
7054 return floatx80_default_nan(status);
7056 aSig = extractFloatx80Frac( a );
7057 aExp = extractFloatx80Exp( a );
7058 aSign = extractFloatx80Sign( a );
7060 if ( aExp == 0x7FFF ) {
7062 return propagateFloatx80NaN(a, a, status);
7076 } else if (n < -0x10000) {
7081 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7082 aSign, aExp, aSig, 0, status);
7085 float128 float128_scalbn(float128 a, int n, float_status *status)
7089 uint64_t aSig0, aSig1;
7091 aSig1 = extractFloat128Frac1( a );
7092 aSig0 = extractFloat128Frac0( a );
7093 aExp = extractFloat128Exp( a );
7094 aSign = extractFloat128Sign( a );
7095 if ( aExp == 0x7FFF ) {
7096 if ( aSig0 | aSig1 ) {
7097 return propagateFloat128NaN(a, a, status);
7102 aSig0 |= LIT64( 0x0001000000000000 );
7103 } else if (aSig0 == 0 && aSig1 == 0) {
7111 } else if (n < -0x10000) {
7116 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1