4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a)
141 return float32_val(a) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a)
150 return (float32_val(a) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag extractFloat32Sign(float32 a)
159 return float32_val(a) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a)
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a)
177 return (float64_val(a) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag extractFloat64Sign(float64 a)
186 return float64_val(a) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__)) {
195 float_class_unclassified,
199 float_class_qnan, /* all NaNs from here */
202 float_class_msnan, /* maybe silenced */
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
247 uint64_t roundeven_mask;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params = {
266 static const FloatFmt float32_params = {
270 static const FloatFmt float64_params = {
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
287 static inline FloatParts float16_unpack_raw(float16 f)
289 return unpack_raw(float16_params, f);
292 static inline FloatParts float32_unpack_raw(float32 f)
294 return unpack_raw(float32_params, f);
297 static inline FloatParts float64_unpack_raw(float64 f)
299 return unpack_raw(float64_params, f);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
310 static inline float16 float16_pack_raw(FloatParts p)
312 return make_float16(pack_raw(float16_params, p));
315 static inline float32 float32_pack_raw(FloatParts p)
317 return make_float32(pack_raw(float32_params, p));
320 static inline float64 float64_pack_raw(FloatParts p)
322 return make_float64(pack_raw(float64_params, p));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
333 #ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
340 part.cls = float_class_qnan;
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
393 case float_round_ties_away:
394 overflow_norm = false;
397 case float_round_to_zero:
398 overflow_norm = true;
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
410 g_assert_not_reached();
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
431 p.cls = float_class_inf;
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
452 flags |= float_flag_inexact;
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
468 case float_class_zero:
474 case float_class_inf:
480 case float_class_qnan:
481 case float_class_snan:
486 g_assert_not_reached();
489 float_raise(flags, s);
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
552 return unlikely(c >= float_class_qnan);
554 static bool is_snan(FloatClass c)
556 return c == float_class_snan;
558 static bool is_qnan(FloatClass c)
560 return c == float_class_qnan;
563 static FloatParts return_nan(FloatParts a, float_status *s)
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
577 g_assert_not_reached();
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
594 (a.frac == b.frac && a.sign < b.sign))) {
597 a.cls = float_class_msnan;
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
625 a.cls = float_class_dnan;
628 g_assert_not_reached();
631 a.cls = float_class_msnan;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
649 if (a_sign != b_sign) {
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
692 if (b.cls == float_class_zero) {
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
738 return float16_round_pack_canonical(pr, status);
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
748 return float32_round_pack_canonical(pr, status);
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
758 return float64_round_pack_canonical(pr, status);
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
768 return float16_round_pack_canonical(pr, status);
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
778 return float32_round_pack_canonical(pr, status);
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
788 return float64_round_pack_canonical(pr, status);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
799 bool sign = a.sign ^ b.sign;
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
803 int exp = a.exp + b.exp;
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
839 g_assert_not_reached();
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
849 return float16_round_pack_canonical(pr, status);
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
859 return float32_round_pack_canonical(pr, status);
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
869 return float64_round_pack_canonical(pr, status);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
890 bool sign_flip = flags & float_muladd_negate_result;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
910 if (flags & float_muladd_negate_c) {
914 p_sign = a.sign ^ b.sign;
916 if (flags & float_muladd_negate_product) {
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
925 p_class = float_class_normal;
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
951 } else if (flags & float_muladd_halve_result) {
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
962 p_exp = a.exp + b.exp;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
992 /* shift c to the same binary point as the product (124) */
995 shift128RightJamming(c_hi, c_lo,
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1027 shift128RightJamming(c_hi, c_lo,
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1033 if (hi == 0 && lo == 0) {
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1043 shift = clz64(lo) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1053 lo = lo << (shift - 64);
1055 hi = (hi << shift) | (lo >> (64 - shift));
1056 lo = hi | ((lo << shift) != 0);
1063 if (flags & float_muladd_halve_result) {
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1084 return float16_round_pack_canonical(pr, status);
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1095 return float32_round_pack_canonical(pr, status);
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1106 return float64_round_pack_canonical(pr, status);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1117 bool sign = a.sign ^ b.sign;
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1141 /* 0/0 or Inf/Inf */
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1150 if (b.cls == float_class_zero) {
1151 s->float_exception_flags |= float_flag_divbyzero;
1152 a.cls = float_class_inf;
1156 /* Inf / x or 0 / x */
1157 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1167 g_assert_not_reached();
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1176 return float16_round_pack_canonical(pr, status);
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1185 return float32_round_pack_canonical(pr, status);
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1194 return float64_round_pack_canonical(pr, status);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1232 case float_round_to_zero:
1235 case float_round_up:
1238 case float_round_down:
1242 g_assert_not_reached();
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1249 a.cls = float_class_zero;
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1262 case float_round_ties_away:
1265 case float_round_to_zero:
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1275 g_assert_not_reached();
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1290 g_assert_not_reached();
1295 float16 float16_round_to_int(float16 a, float_status *s)
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1302 float32 float32_round_to_int(float32 a, float_status *s)
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1309 float64 float64_round_to_int(float64 a, float_status *s)
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335 int64_t min, int64_t max,
1339 int orig_flags = get_float_exception_flags(s);
1340 FloatParts p = round_to_int(in, rmode, s);
1343 case float_class_snan:
1344 case float_class_qnan:
1346 case float_class_inf:
1347 return p.sign ? min : max;
1348 case float_class_zero:
1350 case float_class_normal:
1351 if (p.exp < DECOMPOSED_BINARY_POINT) {
1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1359 if (r < -(uint64_t) min) {
1362 s->float_exception_flags = orig_flags | float_flag_invalid;
1369 s->float_exception_flags = orig_flags | float_flag_invalid;
1374 g_assert_not_reached();
1378 #define FLOAT_TO_INT(fsz, isz) \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1383 return round_to_int_and_pack(p, s->float_rounding_mode, \
1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1389 (float ## fsz a, float_status *s) \
1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1392 return round_to_int_and_pack(p, float_round_to_zero, \
1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1412 * Returns the result of converting the floating-point value `a' to
1413 * the unsigned integer format. The conversion is performed according
1414 * to the IEC/IEEE Standard for Binary Floating-Point
1415 * Arithmetic---which means in particular that the conversion is
1416 * rounded according to the current rounding mode. If `a' is a NaN,
1417 * the largest unsigned integer is returned. Otherwise, if the
1418 * conversion overflows, the largest unsigned integer is returned. If
1419 * the 'a' is negative, the result is rounded and zero is returned;
1420 * values that do not round to zero will raise the inexact exception
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1427 int orig_flags = get_float_exception_flags(s);
1428 FloatParts p = round_to_int(in, rmode, s);
1431 case float_class_snan:
1432 case float_class_qnan:
1433 s->float_exception_flags = orig_flags | float_flag_invalid;
1435 case float_class_inf:
1436 return p.sign ? 0 : max;
1437 case float_class_zero:
1439 case float_class_normal:
1443 s->float_exception_flags = orig_flags | float_flag_invalid;
1447 if (p.exp < DECOMPOSED_BINARY_POINT) {
1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1452 s->float_exception_flags = orig_flags | float_flag_invalid;
1456 /* For uint64 this will never trip, but if p.exp is too large
1457 * to shift a decomposed fraction we shall have exited via the
1461 s->float_exception_flags = orig_flags | float_flag_invalid;
1468 g_assert_not_reached();
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1478 UINT ## isz ## _MAX, s); \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1482 (float ## fsz a, float_status *s) \
1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1486 UINT ## isz ## _MAX, s); \
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1501 #undef FLOAT_TO_UINT
1504 * Integer to float conversions
1506 * Returns the result of converting the two's complement integer `a'
1507 * to the floating-point format. The conversion is performed according
1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1515 r.cls = float_class_zero;
1517 } else if (a == (1ULL << 63)) {
1518 r.cls = float_class_normal;
1520 r.frac = DECOMPOSED_IMPLICIT_BIT;
1531 int shift = clz64(f) - 1;
1532 r.cls = float_class_normal;
1533 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534 r.frac = f << shift;
1540 float16 int64_to_float16(int64_t a, float_status *status)
1542 FloatParts pa = int_to_float(a, status);
1543 return float16_round_pack_canonical(pa, status);
1546 float16 int32_to_float16(int32_t a, float_status *status)
1548 return int64_to_float16(a, status);
1551 float16 int16_to_float16(int16_t a, float_status *status)
1553 return int64_to_float16(a, status);
1556 float32 int64_to_float32(int64_t a, float_status *status)
1558 FloatParts pa = int_to_float(a, status);
1559 return float32_round_pack_canonical(pa, status);
1562 float32 int32_to_float32(int32_t a, float_status *status)
1564 return int64_to_float32(a, status);
1567 float32 int16_to_float32(int16_t a, float_status *status)
1569 return int64_to_float32(a, status);
1572 float64 int64_to_float64(int64_t a, float_status *status)
1574 FloatParts pa = int_to_float(a, status);
1575 return float64_round_pack_canonical(pa, status);
1578 float64 int32_to_float64(int32_t a, float_status *status)
1580 return int64_to_float64(a, status);
1583 float64 int16_to_float64(int16_t a, float_status *status)
1585 return int64_to_float64(a, status);
1590 * Unsigned Integer to float conversions
1592 * Returns the result of converting the unsigned integer `a' to the
1593 * floating-point format. The conversion is performed according to the
1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1599 FloatParts r = { .sign = false};
1602 r.cls = float_class_zero;
1604 int spare_bits = clz64(a) - 1;
1605 r.cls = float_class_normal;
1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607 if (spare_bits < 0) {
1608 shift64RightJamming(a, -spare_bits, &a);
1611 r.frac = a << spare_bits;
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1620 FloatParts pa = uint_to_float(a, status);
1621 return float16_round_pack_canonical(pa, status);
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1626 return uint64_to_float16(a, status);
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1631 return uint64_to_float16(a, status);
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1636 FloatParts pa = uint_to_float(a, status);
1637 return float32_round_pack_canonical(pa, status);
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1642 return uint64_to_float32(a, status);
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1647 return uint64_to_float32(a, status);
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1652 FloatParts pa = uint_to_float(a, status);
1653 return float64_round_pack_canonical(pa, status);
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1658 return uint64_to_float64(a, status);
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1663 return uint64_to_float64(a, status);
1667 /* min() and max() functions. These can't be implemented as
1668 * 'compare and pick one input' because that would mishandle
1669 * NaNs and +0 vs -0.
1671 * minnum() and maxnum() functions. These are similar to the min()
1672 * and max() functions but if one of the arguments is a QNaN and
1673 * the other is numerical then the numerical argument is returned.
1674 * SNaNs will get quietened before being returned.
1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676 * and maxNum() operations. min() and max() are the typical min/max
1677 * semantics provided by many CPUs which predate that specification.
1679 * minnummag() and maxnummag() functions correspond to minNumMag()
1680 * and minNumMag() from the IEEE-754 2008.
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683 bool ieee, bool ismag, float_status *s)
1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1687 /* Takes two floating-point values `a' and `b', one of
1688 * which is a NaN, and returns the appropriate NaN
1689 * result. If either `a' or `b' is a signaling NaN,
1690 * the invalid exception is raised.
1692 if (is_snan(a.cls) || is_snan(b.cls)) {
1693 return pick_nan(a, b, s);
1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1700 return pick_nan(a, b, s);
1703 bool a_sign, b_sign;
1706 case float_class_normal:
1709 case float_class_inf:
1712 case float_class_zero:
1716 g_assert_not_reached();
1720 case float_class_normal:
1723 case float_class_inf:
1726 case float_class_zero:
1730 g_assert_not_reached();
1737 a_sign = b_sign = 0;
1740 if (a_sign == b_sign) {
1741 bool a_less = a_exp < b_exp;
1742 if (a_exp == b_exp) {
1743 a_less = a.frac < b.frac;
1745 return a_sign ^ a_less ^ ismin ? b : a;
1747 return a_sign ^ ismin ? b : a;
1752 #define MINMAX(sz, name, ismin, isiee, ismag) \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1760 return float ## sz ## _round_pack_canonical(pr, s); \
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1786 /* Multiply A by 2 raised to the power N. */
1787 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1789 if (unlikely(is_nan(a.cls))) {
1790 return return_nan(a, s);
1792 if (a.cls == float_class_normal) {
1798 float16 float16_scalbn(float16 a, int n, float_status *status)
1800 FloatParts pa = float16_unpack_canonical(a, status);
1801 FloatParts pr = scalbn_decomposed(pa, n, status);
1802 return float16_round_pack_canonical(pr, status);
1805 float32 float32_scalbn(float32 a, int n, float_status *status)
1807 FloatParts pa = float32_unpack_canonical(a, status);
1808 FloatParts pr = scalbn_decomposed(pa, n, status);
1809 return float32_round_pack_canonical(pr, status);
1812 float64 float64_scalbn(float64 a, int n, float_status *status)
1814 FloatParts pa = float64_unpack_canonical(a, status);
1815 FloatParts pr = scalbn_decomposed(pa, n, status);
1816 return float64_round_pack_canonical(pr, status);
1819 /*----------------------------------------------------------------------------
1820 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1821 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1822 | input. If `zSign' is 1, the input is negated before being converted to an
1823 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
1824 | is simply rounded to an integer, with the inexact exception raised if the
1825 | input cannot be represented exactly as an integer. However, if the fixed-
1826 | point input is too large, the invalid exception is raised and the largest
1827 | positive or negative integer is returned.
1828 *----------------------------------------------------------------------------*/
1830 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
1832 int8_t roundingMode;
1833 flag roundNearestEven;
1834 int8_t roundIncrement, roundBits;
1837 roundingMode = status->float_rounding_mode;
1838 roundNearestEven = ( roundingMode == float_round_nearest_even );
1839 switch (roundingMode) {
1840 case float_round_nearest_even:
1841 case float_round_ties_away:
1842 roundIncrement = 0x40;
1844 case float_round_to_zero:
1847 case float_round_up:
1848 roundIncrement = zSign ? 0 : 0x7f;
1850 case float_round_down:
1851 roundIncrement = zSign ? 0x7f : 0;
1856 roundBits = absZ & 0x7F;
1857 absZ = ( absZ + roundIncrement )>>7;
1858 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1860 if ( zSign ) z = - z;
1861 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
1862 float_raise(float_flag_invalid, status);
1863 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
1866 status->float_exception_flags |= float_flag_inexact;
1872 /*----------------------------------------------------------------------------
1873 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1874 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1875 | and returns the properly rounded 64-bit integer corresponding to the input.
1876 | If `zSign' is 1, the input is negated before being converted to an integer.
1877 | Ordinarily, the fixed-point input is simply rounded to an integer, with
1878 | the inexact exception raised if the input cannot be represented exactly as
1879 | an integer. However, if the fixed-point input is too large, the invalid
1880 | exception is raised and the largest positive or negative integer is
1882 *----------------------------------------------------------------------------*/
1884 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
1885 float_status *status)
1887 int8_t roundingMode;
1888 flag roundNearestEven, increment;
1891 roundingMode = status->float_rounding_mode;
1892 roundNearestEven = ( roundingMode == float_round_nearest_even );
1893 switch (roundingMode) {
1894 case float_round_nearest_even:
1895 case float_round_ties_away:
1896 increment = ((int64_t) absZ1 < 0);
1898 case float_round_to_zero:
1901 case float_round_up:
1902 increment = !zSign && absZ1;
1904 case float_round_down:
1905 increment = zSign && absZ1;
1912 if ( absZ0 == 0 ) goto overflow;
1913 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1916 if ( zSign ) z = - z;
1917 if ( z && ( ( z < 0 ) ^ zSign ) ) {
1919 float_raise(float_flag_invalid, status);
1921 zSign ? (int64_t) LIT64( 0x8000000000000000 )
1922 : LIT64( 0x7FFFFFFFFFFFFFFF );
1925 status->float_exception_flags |= float_flag_inexact;
1931 /*----------------------------------------------------------------------------
1932 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1933 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1934 | and returns the properly rounded 64-bit unsigned integer corresponding to the
1935 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
1936 | with the inexact exception raised if the input cannot be represented exactly
1937 | as an integer. However, if the fixed-point input is too large, the invalid
1938 | exception is raised and the largest unsigned integer is returned.
1939 *----------------------------------------------------------------------------*/
1941 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
1942 uint64_t absZ1, float_status *status)
1944 int8_t roundingMode;
1945 flag roundNearestEven, increment;
1947 roundingMode = status->float_rounding_mode;
1948 roundNearestEven = (roundingMode == float_round_nearest_even);
1949 switch (roundingMode) {
1950 case float_round_nearest_even:
1951 case float_round_ties_away:
1952 increment = ((int64_t)absZ1 < 0);
1954 case float_round_to_zero:
1957 case float_round_up:
1958 increment = !zSign && absZ1;
1960 case float_round_down:
1961 increment = zSign && absZ1;
1969 float_raise(float_flag_invalid, status);
1970 return LIT64(0xFFFFFFFFFFFFFFFF);
1972 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1975 if (zSign && absZ0) {
1976 float_raise(float_flag_invalid, status);
1981 status->float_exception_flags |= float_flag_inexact;
1986 /*----------------------------------------------------------------------------
1987 | If `a' is denormal and we are in flush-to-zero mode then set the
1988 | input-denormal exception and return zero. Otherwise just return the value.
1989 *----------------------------------------------------------------------------*/
1990 float32 float32_squash_input_denormal(float32 a, float_status *status)
1992 if (status->flush_inputs_to_zero) {
1993 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
1994 float_raise(float_flag_input_denormal, status);
1995 return make_float32(float32_val(a) & 0x80000000);
2001 /*----------------------------------------------------------------------------
2002 | Normalizes the subnormal single-precision floating-point value represented
2003 | by the denormalized significand `aSig'. The normalized exponent and
2004 | significand are stored at the locations pointed to by `zExpPtr' and
2005 | `zSigPtr', respectively.
2006 *----------------------------------------------------------------------------*/
2009 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2013 shiftCount = countLeadingZeros32( aSig ) - 8;
2014 *zSigPtr = aSig<<shiftCount;
2015 *zExpPtr = 1 - shiftCount;
2019 /*----------------------------------------------------------------------------
2020 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2021 | single-precision floating-point value, returning the result. After being
2022 | shifted into the proper positions, the three fields are simply added
2023 | together to form the result. This means that any integer portion of `zSig'
2024 | will be added into the exponent. Since a properly normalized significand
2025 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2026 | than the desired result exponent whenever `zSig' is a complete, normalized
2028 *----------------------------------------------------------------------------*/
2030 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
2033 return make_float32(
2034 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
2038 /*----------------------------------------------------------------------------
2039 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2040 | and significand `zSig', and returns the proper single-precision floating-
2041 | point value corresponding to the abstract input. Ordinarily, the abstract
2042 | value is simply rounded and packed into the single-precision format, with
2043 | the inexact exception raised if the abstract input cannot be represented
2044 | exactly. However, if the abstract value is too large, the overflow and
2045 | inexact exceptions are raised and an infinity or maximal finite value is
2046 | returned. If the abstract value is too small, the input value is rounded to
2047 | a subnormal number, and the underflow and inexact exceptions are raised if
2048 | the abstract input cannot be represented exactly as a subnormal single-
2049 | precision floating-point number.
2050 | The input significand `zSig' has its binary point between bits 30
2051 | and 29, which is 7 bits to the left of the usual location. This shifted
2052 | significand must be normalized or smaller. If `zSig' is not normalized,
2053 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2054 | and it must not require rounding. In the usual case that `zSig' is
2055 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2056 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2057 | Binary Floating-Point Arithmetic.
2058 *----------------------------------------------------------------------------*/
2060 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2061 float_status *status)
2063 int8_t roundingMode;
2064 flag roundNearestEven;
2065 int8_t roundIncrement, roundBits;
2068 roundingMode = status->float_rounding_mode;
2069 roundNearestEven = ( roundingMode == float_round_nearest_even );
2070 switch (roundingMode) {
2071 case float_round_nearest_even:
2072 case float_round_ties_away:
2073 roundIncrement = 0x40;
2075 case float_round_to_zero:
2078 case float_round_up:
2079 roundIncrement = zSign ? 0 : 0x7f;
2081 case float_round_down:
2082 roundIncrement = zSign ? 0x7f : 0;
2088 roundBits = zSig & 0x7F;
2089 if ( 0xFD <= (uint16_t) zExp ) {
2090 if ( ( 0xFD < zExp )
2091 || ( ( zExp == 0xFD )
2092 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2094 float_raise(float_flag_overflow | float_flag_inexact, status);
2095 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2098 if (status->flush_to_zero) {
2099 float_raise(float_flag_output_denormal, status);
2100 return packFloat32(zSign, 0, 0);
2103 (status->float_detect_tininess
2104 == float_tininess_before_rounding)
2106 || ( zSig + roundIncrement < 0x80000000 );
2107 shift32RightJamming( zSig, - zExp, &zSig );
2109 roundBits = zSig & 0x7F;
2110 if (isTiny && roundBits) {
2111 float_raise(float_flag_underflow, status);
2116 status->float_exception_flags |= float_flag_inexact;
2118 zSig = ( zSig + roundIncrement )>>7;
2119 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2120 if ( zSig == 0 ) zExp = 0;
2121 return packFloat32( zSign, zExp, zSig );
2125 /*----------------------------------------------------------------------------
2126 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2127 | and significand `zSig', and returns the proper single-precision floating-
2128 | point value corresponding to the abstract input. This routine is just like
2129 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2130 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2131 | floating-point exponent.
2132 *----------------------------------------------------------------------------*/
2135 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2136 float_status *status)
2140 shiftCount = countLeadingZeros32( zSig ) - 1;
2141 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2146 /*----------------------------------------------------------------------------
2147 | If `a' is denormal and we are in flush-to-zero mode then set the
2148 | input-denormal exception and return zero. Otherwise just return the value.
2149 *----------------------------------------------------------------------------*/
2150 float64 float64_squash_input_denormal(float64 a, float_status *status)
2152 if (status->flush_inputs_to_zero) {
2153 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2154 float_raise(float_flag_input_denormal, status);
2155 return make_float64(float64_val(a) & (1ULL << 63));
2161 /*----------------------------------------------------------------------------
2162 | Normalizes the subnormal double-precision floating-point value represented
2163 | by the denormalized significand `aSig'. The normalized exponent and
2164 | significand are stored at the locations pointed to by `zExpPtr' and
2165 | `zSigPtr', respectively.
2166 *----------------------------------------------------------------------------*/
2169 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2173 shiftCount = countLeadingZeros64( aSig ) - 11;
2174 *zSigPtr = aSig<<shiftCount;
2175 *zExpPtr = 1 - shiftCount;
2179 /*----------------------------------------------------------------------------
2180 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2181 | double-precision floating-point value, returning the result. After being
2182 | shifted into the proper positions, the three fields are simply added
2183 | together to form the result. This means that any integer portion of `zSig'
2184 | will be added into the exponent. Since a properly normalized significand
2185 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2186 | than the desired result exponent whenever `zSig' is a complete, normalized
2188 *----------------------------------------------------------------------------*/
2190 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2193 return make_float64(
2194 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2198 /*----------------------------------------------------------------------------
2199 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2200 | and significand `zSig', and returns the proper double-precision floating-
2201 | point value corresponding to the abstract input. Ordinarily, the abstract
2202 | value is simply rounded and packed into the double-precision format, with
2203 | the inexact exception raised if the abstract input cannot be represented
2204 | exactly. However, if the abstract value is too large, the overflow and
2205 | inexact exceptions are raised and an infinity or maximal finite value is
2206 | returned. If the abstract value is too small, the input value is rounded to
2207 | a subnormal number, and the underflow and inexact exceptions are raised if
2208 | the abstract input cannot be represented exactly as a subnormal double-
2209 | precision floating-point number.
2210 | The input significand `zSig' has its binary point between bits 62
2211 | and 61, which is 10 bits to the left of the usual location. This shifted
2212 | significand must be normalized or smaller. If `zSig' is not normalized,
2213 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2214 | and it must not require rounding. In the usual case that `zSig' is
2215 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2216 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2217 | Binary Floating-Point Arithmetic.
2218 *----------------------------------------------------------------------------*/
2220 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2221 float_status *status)
2223 int8_t roundingMode;
2224 flag roundNearestEven;
2225 int roundIncrement, roundBits;
2228 roundingMode = status->float_rounding_mode;
2229 roundNearestEven = ( roundingMode == float_round_nearest_even );
2230 switch (roundingMode) {
2231 case float_round_nearest_even:
2232 case float_round_ties_away:
2233 roundIncrement = 0x200;
2235 case float_round_to_zero:
2238 case float_round_up:
2239 roundIncrement = zSign ? 0 : 0x3ff;
2241 case float_round_down:
2242 roundIncrement = zSign ? 0x3ff : 0;
2244 case float_round_to_odd:
2245 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2250 roundBits = zSig & 0x3FF;
2251 if ( 0x7FD <= (uint16_t) zExp ) {
2252 if ( ( 0x7FD < zExp )
2253 || ( ( zExp == 0x7FD )
2254 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2256 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2257 roundIncrement != 0;
2258 float_raise(float_flag_overflow | float_flag_inexact, status);
2259 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2262 if (status->flush_to_zero) {
2263 float_raise(float_flag_output_denormal, status);
2264 return packFloat64(zSign, 0, 0);
2267 (status->float_detect_tininess
2268 == float_tininess_before_rounding)
2270 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2271 shift64RightJamming( zSig, - zExp, &zSig );
2273 roundBits = zSig & 0x3FF;
2274 if (isTiny && roundBits) {
2275 float_raise(float_flag_underflow, status);
2277 if (roundingMode == float_round_to_odd) {
2279 * For round-to-odd case, the roundIncrement depends on
2280 * zSig which just changed.
2282 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2287 status->float_exception_flags |= float_flag_inexact;
2289 zSig = ( zSig + roundIncrement )>>10;
2290 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2291 if ( zSig == 0 ) zExp = 0;
2292 return packFloat64( zSign, zExp, zSig );
2296 /*----------------------------------------------------------------------------
2297 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2298 | and significand `zSig', and returns the proper double-precision floating-
2299 | point value corresponding to the abstract input. This routine is just like
2300 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2301 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2302 | floating-point exponent.
2303 *----------------------------------------------------------------------------*/
2306 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2307 float_status *status)
2311 shiftCount = countLeadingZeros64( zSig ) - 1;
2312 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2317 /*----------------------------------------------------------------------------
2318 | Returns the fraction bits of the extended double-precision floating-point
2320 *----------------------------------------------------------------------------*/
2322 static inline uint64_t extractFloatx80Frac( floatx80 a )
2329 /*----------------------------------------------------------------------------
2330 | Returns the exponent bits of the extended double-precision floating-point
2332 *----------------------------------------------------------------------------*/
2334 static inline int32_t extractFloatx80Exp( floatx80 a )
2337 return a.high & 0x7FFF;
2341 /*----------------------------------------------------------------------------
2342 | Returns the sign bit of the extended double-precision floating-point value
2344 *----------------------------------------------------------------------------*/
2346 static inline flag extractFloatx80Sign( floatx80 a )
2353 /*----------------------------------------------------------------------------
2354 | Normalizes the subnormal extended double-precision floating-point value
2355 | represented by the denormalized significand `aSig'. The normalized exponent
2356 | and significand are stored at the locations pointed to by `zExpPtr' and
2357 | `zSigPtr', respectively.
2358 *----------------------------------------------------------------------------*/
2361 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
2365 shiftCount = countLeadingZeros64( aSig );
2366 *zSigPtr = aSig<<shiftCount;
2367 *zExpPtr = 1 - shiftCount;
2371 /*----------------------------------------------------------------------------
2372 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2373 | extended double-precision floating-point value, returning the result.
2374 *----------------------------------------------------------------------------*/
2376 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
2381 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
2386 /*----------------------------------------------------------------------------
2387 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2388 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2389 | and returns the proper extended double-precision floating-point value
2390 | corresponding to the abstract input. Ordinarily, the abstract value is
2391 | rounded and packed into the extended double-precision format, with the
2392 | inexact exception raised if the abstract input cannot be represented
2393 | exactly. However, if the abstract value is too large, the overflow and
2394 | inexact exceptions are raised and an infinity or maximal finite value is
2395 | returned. If the abstract value is too small, the input value is rounded to
2396 | a subnormal number, and the underflow and inexact exceptions are raised if
2397 | the abstract input cannot be represented exactly as a subnormal extended
2398 | double-precision floating-point number.
2399 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2400 | number of bits as single or double precision, respectively. Otherwise, the
2401 | result is rounded to the full precision of the extended double-precision
2403 | The input significand must be normalized or smaller. If the input
2404 | significand is not normalized, `zExp' must be 0; in that case, the result
2405 | returned is a subnormal number, and it must not require rounding. The
2406 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2407 | Floating-Point Arithmetic.
2408 *----------------------------------------------------------------------------*/
2410 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2411 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2412 float_status *status)
2414 int8_t roundingMode;
2415 flag roundNearestEven, increment, isTiny;
2416 int64_t roundIncrement, roundMask, roundBits;
2418 roundingMode = status->float_rounding_mode;
2419 roundNearestEven = ( roundingMode == float_round_nearest_even );
2420 if ( roundingPrecision == 80 ) goto precision80;
2421 if ( roundingPrecision == 64 ) {
2422 roundIncrement = LIT64( 0x0000000000000400 );
2423 roundMask = LIT64( 0x00000000000007FF );
2425 else if ( roundingPrecision == 32 ) {
2426 roundIncrement = LIT64( 0x0000008000000000 );
2427 roundMask = LIT64( 0x000000FFFFFFFFFF );
2432 zSig0 |= ( zSig1 != 0 );
2433 switch (roundingMode) {
2434 case float_round_nearest_even:
2435 case float_round_ties_away:
2437 case float_round_to_zero:
2440 case float_round_up:
2441 roundIncrement = zSign ? 0 : roundMask;
2443 case float_round_down:
2444 roundIncrement = zSign ? roundMask : 0;
2449 roundBits = zSig0 & roundMask;
2450 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2451 if ( ( 0x7FFE < zExp )
2452 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2457 if (status->flush_to_zero) {
2458 float_raise(float_flag_output_denormal, status);
2459 return packFloatx80(zSign, 0, 0);
2462 (status->float_detect_tininess
2463 == float_tininess_before_rounding)
2465 || ( zSig0 <= zSig0 + roundIncrement );
2466 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2468 roundBits = zSig0 & roundMask;
2469 if (isTiny && roundBits) {
2470 float_raise(float_flag_underflow, status);
2473 status->float_exception_flags |= float_flag_inexact;
2475 zSig0 += roundIncrement;
2476 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2477 roundIncrement = roundMask + 1;
2478 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2479 roundMask |= roundIncrement;
2481 zSig0 &= ~ roundMask;
2482 return packFloatx80( zSign, zExp, zSig0 );
2486 status->float_exception_flags |= float_flag_inexact;
2488 zSig0 += roundIncrement;
2489 if ( zSig0 < roundIncrement ) {
2491 zSig0 = LIT64( 0x8000000000000000 );
2493 roundIncrement = roundMask + 1;
2494 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2495 roundMask |= roundIncrement;
2497 zSig0 &= ~ roundMask;
2498 if ( zSig0 == 0 ) zExp = 0;
2499 return packFloatx80( zSign, zExp, zSig0 );
2501 switch (roundingMode) {
2502 case float_round_nearest_even:
2503 case float_round_ties_away:
2504 increment = ((int64_t)zSig1 < 0);
2506 case float_round_to_zero:
2509 case float_round_up:
2510 increment = !zSign && zSig1;
2512 case float_round_down:
2513 increment = zSign && zSig1;
2518 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2519 if ( ( 0x7FFE < zExp )
2520 || ( ( zExp == 0x7FFE )
2521 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2527 float_raise(float_flag_overflow | float_flag_inexact, status);
2528 if ( ( roundingMode == float_round_to_zero )
2529 || ( zSign && ( roundingMode == float_round_up ) )
2530 || ( ! zSign && ( roundingMode == float_round_down ) )
2532 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2534 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2538 (status->float_detect_tininess
2539 == float_tininess_before_rounding)
2542 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2543 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2545 if (isTiny && zSig1) {
2546 float_raise(float_flag_underflow, status);
2549 status->float_exception_flags |= float_flag_inexact;
2551 switch (roundingMode) {
2552 case float_round_nearest_even:
2553 case float_round_ties_away:
2554 increment = ((int64_t)zSig1 < 0);
2556 case float_round_to_zero:
2559 case float_round_up:
2560 increment = !zSign && zSig1;
2562 case float_round_down:
2563 increment = zSign && zSig1;
2571 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2572 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2574 return packFloatx80( zSign, zExp, zSig0 );
2578 status->float_exception_flags |= float_flag_inexact;
2584 zSig0 = LIT64( 0x8000000000000000 );
2587 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2591 if ( zSig0 == 0 ) zExp = 0;
2593 return packFloatx80( zSign, zExp, zSig0 );
2597 /*----------------------------------------------------------------------------
2598 | Takes an abstract floating-point value having sign `zSign', exponent
2599 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2600 | and returns the proper extended double-precision floating-point value
2601 | corresponding to the abstract input. This routine is just like
2602 | `roundAndPackFloatx80' except that the input significand does not have to be
2604 *----------------------------------------------------------------------------*/
2606 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2607 flag zSign, int32_t zExp,
2608 uint64_t zSig0, uint64_t zSig1,
2609 float_status *status)
2618 shiftCount = countLeadingZeros64( zSig0 );
2619 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2621 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2622 zSig0, zSig1, status);
2626 /*----------------------------------------------------------------------------
2627 | Returns the least-significant 64 fraction bits of the quadruple-precision
2628 | floating-point value `a'.
2629 *----------------------------------------------------------------------------*/
2631 static inline uint64_t extractFloat128Frac1( float128 a )
2638 /*----------------------------------------------------------------------------
2639 | Returns the most-significant 48 fraction bits of the quadruple-precision
2640 | floating-point value `a'.
2641 *----------------------------------------------------------------------------*/
2643 static inline uint64_t extractFloat128Frac0( float128 a )
2646 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2650 /*----------------------------------------------------------------------------
2651 | Returns the exponent bits of the quadruple-precision floating-point value
2653 *----------------------------------------------------------------------------*/
2655 static inline int32_t extractFloat128Exp( float128 a )
2658 return ( a.high>>48 ) & 0x7FFF;
2662 /*----------------------------------------------------------------------------
2663 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2664 *----------------------------------------------------------------------------*/
2666 static inline flag extractFloat128Sign( float128 a )
2673 /*----------------------------------------------------------------------------
2674 | Normalizes the subnormal quadruple-precision floating-point value
2675 | represented by the denormalized significand formed by the concatenation of
2676 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2677 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2678 | significand are stored at the location pointed to by `zSig0Ptr', and the
2679 | least significant 64 bits of the normalized significand are stored at the
2680 | location pointed to by `zSig1Ptr'.
2681 *----------------------------------------------------------------------------*/
2684 normalizeFloat128Subnormal(
2695 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2696 if ( shiftCount < 0 ) {
2697 *zSig0Ptr = aSig1>>( - shiftCount );
2698 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2701 *zSig0Ptr = aSig1<<shiftCount;
2704 *zExpPtr = - shiftCount - 63;
2707 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2708 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2709 *zExpPtr = 1 - shiftCount;
2714 /*----------------------------------------------------------------------------
2715 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2716 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2717 | floating-point value, returning the result. After being shifted into the
2718 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2719 | added together to form the most significant 32 bits of the result. This
2720 | means that any integer portion of `zSig0' will be added into the exponent.
2721 | Since a properly normalized significand will have an integer portion equal
2722 | to 1, the `zExp' input should be 1 less than the desired result exponent
2723 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2725 *----------------------------------------------------------------------------*/
2727 static inline float128
2728 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2733 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2738 /*----------------------------------------------------------------------------
2739 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2740 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2741 | and `zSig2', and returns the proper quadruple-precision floating-point value
2742 | corresponding to the abstract input. Ordinarily, the abstract value is
2743 | simply rounded and packed into the quadruple-precision format, with the
2744 | inexact exception raised if the abstract input cannot be represented
2745 | exactly. However, if the abstract value is too large, the overflow and
2746 | inexact exceptions are raised and an infinity or maximal finite value is
2747 | returned. If the abstract value is too small, the input value is rounded to
2748 | a subnormal number, and the underflow and inexact exceptions are raised if
2749 | the abstract input cannot be represented exactly as a subnormal quadruple-
2750 | precision floating-point number.
2751 | The input significand must be normalized or smaller. If the input
2752 | significand is not normalized, `zExp' must be 0; in that case, the result
2753 | returned is a subnormal number, and it must not require rounding. In the
2754 | usual case that the input significand is normalized, `zExp' must be 1 less
2755 | than the ``true'' floating-point exponent. The handling of underflow and
2756 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2757 *----------------------------------------------------------------------------*/
2759 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2760 uint64_t zSig0, uint64_t zSig1,
2761 uint64_t zSig2, float_status *status)
2763 int8_t roundingMode;
2764 flag roundNearestEven, increment, isTiny;
2766 roundingMode = status->float_rounding_mode;
2767 roundNearestEven = ( roundingMode == float_round_nearest_even );
2768 switch (roundingMode) {
2769 case float_round_nearest_even:
2770 case float_round_ties_away:
2771 increment = ((int64_t)zSig2 < 0);
2773 case float_round_to_zero:
2776 case float_round_up:
2777 increment = !zSign && zSig2;
2779 case float_round_down:
2780 increment = zSign && zSig2;
2782 case float_round_to_odd:
2783 increment = !(zSig1 & 0x1) && zSig2;
2788 if ( 0x7FFD <= (uint32_t) zExp ) {
2789 if ( ( 0x7FFD < zExp )
2790 || ( ( zExp == 0x7FFD )
2792 LIT64( 0x0001FFFFFFFFFFFF ),
2793 LIT64( 0xFFFFFFFFFFFFFFFF ),
2800 float_raise(float_flag_overflow | float_flag_inexact, status);
2801 if ( ( roundingMode == float_round_to_zero )
2802 || ( zSign && ( roundingMode == float_round_up ) )
2803 || ( ! zSign && ( roundingMode == float_round_down ) )
2804 || (roundingMode == float_round_to_odd)
2810 LIT64( 0x0000FFFFFFFFFFFF ),
2811 LIT64( 0xFFFFFFFFFFFFFFFF )
2814 return packFloat128( zSign, 0x7FFF, 0, 0 );
2817 if (status->flush_to_zero) {
2818 float_raise(float_flag_output_denormal, status);
2819 return packFloat128(zSign, 0, 0, 0);
2822 (status->float_detect_tininess
2823 == float_tininess_before_rounding)
2829 LIT64( 0x0001FFFFFFFFFFFF ),
2830 LIT64( 0xFFFFFFFFFFFFFFFF )
2832 shift128ExtraRightJamming(
2833 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2835 if (isTiny && zSig2) {
2836 float_raise(float_flag_underflow, status);
2838 switch (roundingMode) {
2839 case float_round_nearest_even:
2840 case float_round_ties_away:
2841 increment = ((int64_t)zSig2 < 0);
2843 case float_round_to_zero:
2846 case float_round_up:
2847 increment = !zSign && zSig2;
2849 case float_round_down:
2850 increment = zSign && zSig2;
2852 case float_round_to_odd:
2853 increment = !(zSig1 & 0x1) && zSig2;
2861 status->float_exception_flags |= float_flag_inexact;
2864 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2865 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2868 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2870 return packFloat128( zSign, zExp, zSig0, zSig1 );
2874 /*----------------------------------------------------------------------------
2875 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2876 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2877 | returns the proper quadruple-precision floating-point value corresponding
2878 | to the abstract input. This routine is just like `roundAndPackFloat128'
2879 | except that the input significand has fewer bits and does not have to be
2880 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2882 *----------------------------------------------------------------------------*/
2884 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2885 uint64_t zSig0, uint64_t zSig1,
2886 float_status *status)
2896 shiftCount = countLeadingZeros64( zSig0 ) - 15;
2897 if ( 0 <= shiftCount ) {
2899 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2902 shift128ExtraRightJamming(
2903 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2906 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
2911 /*----------------------------------------------------------------------------
2912 | Returns the result of converting the 32-bit two's complement integer `a'
2913 | to the extended double-precision floating-point format. The conversion
2914 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2916 *----------------------------------------------------------------------------*/
2918 floatx80 int32_to_floatx80(int32_t a, float_status *status)
2925 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2927 absA = zSign ? - a : a;
2928 shiftCount = countLeadingZeros32( absA ) + 32;
2930 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2934 /*----------------------------------------------------------------------------
2935 | Returns the result of converting the 32-bit two's complement integer `a' to
2936 | the quadruple-precision floating-point format. The conversion is performed
2937 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2940 float128 int32_to_float128(int32_t a, float_status *status)
2947 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2949 absA = zSign ? - a : a;
2950 shiftCount = countLeadingZeros32( absA ) + 17;
2952 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2956 /*----------------------------------------------------------------------------
2957 | Returns the result of converting the 64-bit two's complement integer `a'
2958 | to the extended double-precision floating-point format. The conversion
2959 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2961 *----------------------------------------------------------------------------*/
2963 floatx80 int64_to_floatx80(int64_t a, float_status *status)
2969 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2971 absA = zSign ? - a : a;
2972 shiftCount = countLeadingZeros64( absA );
2973 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2977 /*----------------------------------------------------------------------------
2978 | Returns the result of converting the 64-bit two's complement integer `a' to
2979 | the quadruple-precision floating-point format. The conversion is performed
2980 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2981 *----------------------------------------------------------------------------*/
2983 float128 int64_to_float128(int64_t a, float_status *status)
2989 uint64_t zSig0, zSig1;
2991 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2993 absA = zSign ? - a : a;
2994 shiftCount = countLeadingZeros64( absA ) + 49;
2995 zExp = 0x406E - shiftCount;
2996 if ( 64 <= shiftCount ) {
3005 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3006 return packFloat128( zSign, zExp, zSig0, zSig1 );
3010 /*----------------------------------------------------------------------------
3011 | Returns the result of converting the 64-bit unsigned integer `a'
3012 | to the quadruple-precision floating-point format. The conversion is performed
3013 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3014 *----------------------------------------------------------------------------*/
3016 float128 uint64_to_float128(uint64_t a, float_status *status)
3019 return float128_zero;
3021 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3027 /*----------------------------------------------------------------------------
3028 | Returns the result of converting the single-precision floating-point value
3029 | `a' to the double-precision floating-point format. The conversion is
3030 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3032 *----------------------------------------------------------------------------*/
3034 float64 float32_to_float64(float32 a, float_status *status)
3039 a = float32_squash_input_denormal(a, status);
3041 aSig = extractFloat32Frac( a );
3042 aExp = extractFloat32Exp( a );
3043 aSign = extractFloat32Sign( a );
3044 if ( aExp == 0xFF ) {
3046 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3048 return packFloat64( aSign, 0x7FF, 0 );
3051 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3052 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3055 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3059 /*----------------------------------------------------------------------------
3060 | Returns the result of converting the single-precision floating-point value
3061 | `a' to the extended double-precision floating-point format. The conversion
3062 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3064 *----------------------------------------------------------------------------*/
3066 floatx80 float32_to_floatx80(float32 a, float_status *status)
3072 a = float32_squash_input_denormal(a, status);
3073 aSig = extractFloat32Frac( a );
3074 aExp = extractFloat32Exp( a );
3075 aSign = extractFloat32Sign( a );
3076 if ( aExp == 0xFF ) {
3078 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3080 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3083 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3084 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3087 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3091 /*----------------------------------------------------------------------------
3092 | Returns the result of converting the single-precision floating-point value
3093 | `a' to the double-precision floating-point format. The conversion is
3094 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3096 *----------------------------------------------------------------------------*/
3098 float128 float32_to_float128(float32 a, float_status *status)
3104 a = float32_squash_input_denormal(a, status);
3105 aSig = extractFloat32Frac( a );
3106 aExp = extractFloat32Exp( a );
3107 aSign = extractFloat32Sign( a );
3108 if ( aExp == 0xFF ) {
3110 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3112 return packFloat128( aSign, 0x7FFF, 0, 0 );
3115 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3116 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3119 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3123 /*----------------------------------------------------------------------------
3124 | Returns the remainder of the single-precision floating-point value `a'
3125 | with respect to the corresponding value `b'. The operation is performed
3126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3127 *----------------------------------------------------------------------------*/
3129 float32 float32_rem(float32 a, float32 b, float_status *status)
3132 int aExp, bExp, expDiff;
3133 uint32_t aSig, bSig;
3135 uint64_t aSig64, bSig64, q64;
3136 uint32_t alternateASig;
3138 a = float32_squash_input_denormal(a, status);
3139 b = float32_squash_input_denormal(b, status);
3141 aSig = extractFloat32Frac( a );
3142 aExp = extractFloat32Exp( a );
3143 aSign = extractFloat32Sign( a );
3144 bSig = extractFloat32Frac( b );
3145 bExp = extractFloat32Exp( b );
3146 if ( aExp == 0xFF ) {
3147 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3148 return propagateFloat32NaN(a, b, status);
3150 float_raise(float_flag_invalid, status);
3151 return float32_default_nan(status);
3153 if ( bExp == 0xFF ) {
3155 return propagateFloat32NaN(a, b, status);
3161 float_raise(float_flag_invalid, status);
3162 return float32_default_nan(status);
3164 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3167 if ( aSig == 0 ) return a;
3168 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3170 expDiff = aExp - bExp;
3173 if ( expDiff < 32 ) {
3176 if ( expDiff < 0 ) {
3177 if ( expDiff < -1 ) return a;
3180 q = ( bSig <= aSig );
3181 if ( q ) aSig -= bSig;
3182 if ( 0 < expDiff ) {
3183 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3186 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3194 if ( bSig <= aSig ) aSig -= bSig;
3195 aSig64 = ( (uint64_t) aSig )<<40;
3196 bSig64 = ( (uint64_t) bSig )<<40;
3198 while ( 0 < expDiff ) {
3199 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3200 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3201 aSig64 = - ( ( bSig * q64 )<<38 );
3205 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3206 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3207 q = q64>>( 64 - expDiff );
3209 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3212 alternateASig = aSig;
3215 } while ( 0 <= (int32_t) aSig );
3216 sigMean = aSig + alternateASig;
3217 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3218 aSig = alternateASig;
3220 zSign = ( (int32_t) aSig < 0 );
3221 if ( zSign ) aSig = - aSig;
3222 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3226 /*----------------------------------------------------------------------------
3227 | Returns the square root of the single-precision floating-point value `a'.
3228 | The operation is performed according to the IEC/IEEE Standard for Binary
3229 | Floating-Point Arithmetic.
3230 *----------------------------------------------------------------------------*/
3232 float32 float32_sqrt(float32 a, float_status *status)
3236 uint32_t aSig, zSig;
3238 a = float32_squash_input_denormal(a, status);
3240 aSig = extractFloat32Frac( a );
3241 aExp = extractFloat32Exp( a );
3242 aSign = extractFloat32Sign( a );
3243 if ( aExp == 0xFF ) {
3245 return propagateFloat32NaN(a, float32_zero, status);
3247 if ( ! aSign ) return a;
3248 float_raise(float_flag_invalid, status);
3249 return float32_default_nan(status);
3252 if ( ( aExp | aSig ) == 0 ) return a;
3253 float_raise(float_flag_invalid, status);
3254 return float32_default_nan(status);
3257 if ( aSig == 0 ) return float32_zero;
3258 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3260 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3261 aSig = ( aSig | 0x00800000 )<<8;
3262 zSig = estimateSqrt32( aExp, aSig ) + 2;
3263 if ( ( zSig & 0x7F ) <= 5 ) {
3269 term = ( (uint64_t) zSig ) * zSig;
3270 rem = ( ( (uint64_t) aSig )<<32 ) - term;
3271 while ( (int64_t) rem < 0 ) {
3273 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
3275 zSig |= ( rem != 0 );
3277 shift32RightJamming( zSig, 1, &zSig );
3279 return roundAndPackFloat32(0, zExp, zSig, status);
3283 /*----------------------------------------------------------------------------
3284 | Returns the binary exponential of the single-precision floating-point value
3285 | `a'. The operation is performed according to the IEC/IEEE Standard for
3286 | Binary Floating-Point Arithmetic.
3288 | Uses the following identities:
3290 | 1. -------------------------------------------------------------------------
3294 | 2. -------------------------------------------------------------------------
3297 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3299 *----------------------------------------------------------------------------*/
3301 static const float64 float32_exp2_coefficients[15] =
3303 const_float64( 0x3ff0000000000000ll ), /* 1 */
3304 const_float64( 0x3fe0000000000000ll ), /* 2 */
3305 const_float64( 0x3fc5555555555555ll ), /* 3 */
3306 const_float64( 0x3fa5555555555555ll ), /* 4 */
3307 const_float64( 0x3f81111111111111ll ), /* 5 */
3308 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3309 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3310 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3311 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3312 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3313 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3314 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3315 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3316 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3317 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3320 float32 float32_exp2(float32 a, float_status *status)
3327 a = float32_squash_input_denormal(a, status);
3329 aSig = extractFloat32Frac( a );
3330 aExp = extractFloat32Exp( a );
3331 aSign = extractFloat32Sign( a );
3333 if ( aExp == 0xFF) {
3335 return propagateFloat32NaN(a, float32_zero, status);
3337 return (aSign) ? float32_zero : a;
3340 if (aSig == 0) return float32_one;
3343 float_raise(float_flag_inexact, status);
3345 /* ******************************* */
3346 /* using float64 for approximation */
3347 /* ******************************* */
3348 x = float32_to_float64(a, status);
3349 x = float64_mul(x, float64_ln2, status);
3353 for (i = 0 ; i < 15 ; i++) {
3356 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3357 r = float64_add(r, f, status);
3359 xn = float64_mul(xn, x, status);
3362 return float64_to_float32(r, status);
3365 /*----------------------------------------------------------------------------
3366 | Returns the binary log of the single-precision floating-point value `a'.
3367 | The operation is performed according to the IEC/IEEE Standard for Binary
3368 | Floating-Point Arithmetic.
3369 *----------------------------------------------------------------------------*/
3370 float32 float32_log2(float32 a, float_status *status)
3374 uint32_t aSig, zSig, i;
3376 a = float32_squash_input_denormal(a, status);
3377 aSig = extractFloat32Frac( a );
3378 aExp = extractFloat32Exp( a );
3379 aSign = extractFloat32Sign( a );
3382 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3383 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3386 float_raise(float_flag_invalid, status);
3387 return float32_default_nan(status);
3389 if ( aExp == 0xFF ) {
3391 return propagateFloat32NaN(a, float32_zero, status);
3401 for (i = 1 << 22; i > 0; i >>= 1) {
3402 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3403 if ( aSig & 0x01000000 ) {
3412 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3415 /*----------------------------------------------------------------------------
3416 | Returns 1 if the single-precision floating-point value `a' is equal to
3417 | the corresponding value `b', and 0 otherwise. The invalid exception is
3418 | raised if either operand is a NaN. Otherwise, the comparison is performed
3419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3420 *----------------------------------------------------------------------------*/
3422 int float32_eq(float32 a, float32 b, float_status *status)
3425 a = float32_squash_input_denormal(a, status);
3426 b = float32_squash_input_denormal(b, status);
3428 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3429 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3431 float_raise(float_flag_invalid, status);
3434 av = float32_val(a);
3435 bv = float32_val(b);
3436 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3439 /*----------------------------------------------------------------------------
3440 | Returns 1 if the single-precision floating-point value `a' is less than
3441 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3442 | exception is raised if either operand is a NaN. The comparison is performed
3443 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3444 *----------------------------------------------------------------------------*/
3446 int float32_le(float32 a, float32 b, float_status *status)
3450 a = float32_squash_input_denormal(a, status);
3451 b = float32_squash_input_denormal(b, status);
3453 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3454 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3456 float_raise(float_flag_invalid, status);
3459 aSign = extractFloat32Sign( a );
3460 bSign = extractFloat32Sign( b );
3461 av = float32_val(a);
3462 bv = float32_val(b);
3463 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3464 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3468 /*----------------------------------------------------------------------------
3469 | Returns 1 if the single-precision floating-point value `a' is less than
3470 | the corresponding value `b', and 0 otherwise. The invalid exception is
3471 | raised if either operand is a NaN. The comparison is performed according
3472 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3473 *----------------------------------------------------------------------------*/
3475 int float32_lt(float32 a, float32 b, float_status *status)
3479 a = float32_squash_input_denormal(a, status);
3480 b = float32_squash_input_denormal(b, status);
3482 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3483 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3485 float_raise(float_flag_invalid, status);
3488 aSign = extractFloat32Sign( a );
3489 bSign = extractFloat32Sign( b );
3490 av = float32_val(a);
3491 bv = float32_val(b);
3492 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3493 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3497 /*----------------------------------------------------------------------------
3498 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3499 | be compared, and 0 otherwise. The invalid exception is raised if either
3500 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3501 | Standard for Binary Floating-Point Arithmetic.
3502 *----------------------------------------------------------------------------*/
3504 int float32_unordered(float32 a, float32 b, float_status *status)
3506 a = float32_squash_input_denormal(a, status);
3507 b = float32_squash_input_denormal(b, status);
3509 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3510 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3512 float_raise(float_flag_invalid, status);
3518 /*----------------------------------------------------------------------------
3519 | Returns 1 if the single-precision floating-point value `a' is equal to
3520 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3521 | exception. The comparison is performed according to the IEC/IEEE Standard
3522 | for Binary Floating-Point Arithmetic.
3523 *----------------------------------------------------------------------------*/
3525 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3527 a = float32_squash_input_denormal(a, status);
3528 b = float32_squash_input_denormal(b, status);
3530 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3531 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3533 if (float32_is_signaling_nan(a, status)
3534 || float32_is_signaling_nan(b, status)) {
3535 float_raise(float_flag_invalid, status);
3539 return ( float32_val(a) == float32_val(b) ) ||
3540 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3543 /*----------------------------------------------------------------------------
3544 | Returns 1 if the single-precision floating-point value `a' is less than or
3545 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3546 | cause an exception. Otherwise, the comparison is performed according to the
3547 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3548 *----------------------------------------------------------------------------*/
3550 int float32_le_quiet(float32 a, float32 b, float_status *status)
3554 a = float32_squash_input_denormal(a, status);
3555 b = float32_squash_input_denormal(b, status);
3557 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3558 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3560 if (float32_is_signaling_nan(a, status)
3561 || float32_is_signaling_nan(b, status)) {
3562 float_raise(float_flag_invalid, status);
3566 aSign = extractFloat32Sign( a );
3567 bSign = extractFloat32Sign( b );
3568 av = float32_val(a);
3569 bv = float32_val(b);
3570 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3571 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3575 /*----------------------------------------------------------------------------
3576 | Returns 1 if the single-precision floating-point value `a' is less than
3577 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3578 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3579 | Standard for Binary Floating-Point Arithmetic.
3580 *----------------------------------------------------------------------------*/
3582 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3586 a = float32_squash_input_denormal(a, status);
3587 b = float32_squash_input_denormal(b, status);
3589 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3590 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3592 if (float32_is_signaling_nan(a, status)
3593 || float32_is_signaling_nan(b, status)) {
3594 float_raise(float_flag_invalid, status);
3598 aSign = extractFloat32Sign( a );
3599 bSign = extractFloat32Sign( b );
3600 av = float32_val(a);
3601 bv = float32_val(b);
3602 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3603 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3607 /*----------------------------------------------------------------------------
3608 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3609 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3610 | comparison is performed according to the IEC/IEEE Standard for Binary
3611 | Floating-Point Arithmetic.
3612 *----------------------------------------------------------------------------*/
3614 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3616 a = float32_squash_input_denormal(a, status);
3617 b = float32_squash_input_denormal(b, status);
3619 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3620 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3622 if (float32_is_signaling_nan(a, status)
3623 || float32_is_signaling_nan(b, status)) {
3624 float_raise(float_flag_invalid, status);
3632 /*----------------------------------------------------------------------------
3633 | Returns the result of converting the double-precision floating-point value
3634 | `a' to the single-precision floating-point format. The conversion is
3635 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3637 *----------------------------------------------------------------------------*/
3639 float32 float64_to_float32(float64 a, float_status *status)
3645 a = float64_squash_input_denormal(a, status);
3647 aSig = extractFloat64Frac( a );
3648 aExp = extractFloat64Exp( a );
3649 aSign = extractFloat64Sign( a );
3650 if ( aExp == 0x7FF ) {
3652 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3654 return packFloat32( aSign, 0xFF, 0 );
3656 shift64RightJamming( aSig, 22, &aSig );
3658 if ( aExp || zSig ) {
3662 return roundAndPackFloat32(aSign, aExp, zSig, status);
3667 /*----------------------------------------------------------------------------
3668 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3669 | half-precision floating-point value, returning the result. After being
3670 | shifted into the proper positions, the three fields are simply added
3671 | together to form the result. This means that any integer portion of `zSig'
3672 | will be added into the exponent. Since a properly normalized significand
3673 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3674 | than the desired result exponent whenever `zSig' is a complete, normalized
3676 *----------------------------------------------------------------------------*/
3677 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3679 return make_float16(
3680 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3683 /*----------------------------------------------------------------------------
3684 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3685 | and significand `zSig', and returns the proper half-precision floating-
3686 | point value corresponding to the abstract input. Ordinarily, the abstract
3687 | value is simply rounded and packed into the half-precision format, with
3688 | the inexact exception raised if the abstract input cannot be represented
3689 | exactly. However, if the abstract value is too large, the overflow and
3690 | inexact exceptions are raised and an infinity or maximal finite value is
3691 | returned. If the abstract value is too small, the input value is rounded to
3692 | a subnormal number, and the underflow and inexact exceptions are raised if
3693 | the abstract input cannot be represented exactly as a subnormal half-
3694 | precision floating-point number.
3695 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3696 | ARM-style "alternative representation", which omits the NaN and Inf
3697 | encodings in order to raise the maximum representable exponent by one.
3698 | The input significand `zSig' has its binary point between bits 22
3699 | and 23, which is 13 bits to the left of the usual location. This shifted
3700 | significand must be normalized or smaller. If `zSig' is not normalized,
3701 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3702 | and it must not require rounding. In the usual case that `zSig' is
3703 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3704 | Note the slightly odd position of the binary point in zSig compared with the
3705 | other roundAndPackFloat functions. This should probably be fixed if we
3706 | need to implement more float16 routines than just conversion.
3707 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3708 | Binary Floating-Point Arithmetic.
3709 *----------------------------------------------------------------------------*/
3711 static float16 roundAndPackFloat16(flag zSign, int zExp,
3712 uint32_t zSig, flag ieee,
3713 float_status *status)
3715 int maxexp = ieee ? 29 : 30;
3718 bool rounding_bumps_exp;
3719 bool is_tiny = false;
3721 /* Calculate the mask of bits of the mantissa which are not
3722 * representable in half-precision and will be lost.
3725 /* Will be denormal in halfprec */
3731 /* Normal number in halfprec */
3735 switch (status->float_rounding_mode) {
3736 case float_round_nearest_even:
3737 increment = (mask + 1) >> 1;
3738 if ((zSig & mask) == increment) {
3739 increment = zSig & (increment << 1);
3742 case float_round_ties_away:
3743 increment = (mask + 1) >> 1;
3745 case float_round_up:
3746 increment = zSign ? 0 : mask;
3748 case float_round_down:
3749 increment = zSign ? mask : 0;
3751 default: /* round_to_zero */
3756 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3758 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3760 float_raise(float_flag_overflow | float_flag_inexact, status);
3761 return packFloat16(zSign, 0x1f, 0);
3763 float_raise(float_flag_invalid, status);
3764 return packFloat16(zSign, 0x1f, 0x3ff);
3769 /* Note that flush-to-zero does not affect half-precision results */
3771 (status->float_detect_tininess == float_tininess_before_rounding)
3773 || (!rounding_bumps_exp);
3776 float_raise(float_flag_inexact, status);
3778 float_raise(float_flag_underflow, status);
3783 if (rounding_bumps_exp) {
3789 return packFloat16(zSign, 0, 0);
3795 return packFloat16(zSign, zExp, zSig >> 13);
3798 /*----------------------------------------------------------------------------
3799 | If `a' is denormal and we are in flush-to-zero mode then set the
3800 | input-denormal exception and return zero. Otherwise just return the value.
3801 *----------------------------------------------------------------------------*/
3802 float16 float16_squash_input_denormal(float16 a, float_status *status)
3804 if (status->flush_inputs_to_zero) {
3805 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3806 float_raise(float_flag_input_denormal, status);
3807 return make_float16(float16_val(a) & 0x8000);
3813 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3816 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3817 *zSigPtr = aSig << shiftCount;
3818 *zExpPtr = 1 - shiftCount;
3821 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3822 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3824 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3830 aSign = extractFloat16Sign(a);
3831 aExp = extractFloat16Exp(a);
3832 aSig = extractFloat16Frac(a);
3834 if (aExp == 0x1f && ieee) {
3836 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3838 return packFloat32(aSign, 0xff, 0);
3842 return packFloat32(aSign, 0, 0);
3845 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3848 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3851 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3857 a = float32_squash_input_denormal(a, status);
3859 aSig = extractFloat32Frac( a );
3860 aExp = extractFloat32Exp( a );
3861 aSign = extractFloat32Sign( a );
3862 if ( aExp == 0xFF ) {
3864 /* Input is a NaN */
3866 float_raise(float_flag_invalid, status);
3867 return packFloat16(aSign, 0, 0);
3869 return commonNaNToFloat16(
3870 float32ToCommonNaN(a, status), status);
3874 float_raise(float_flag_invalid, status);
3875 return packFloat16(aSign, 0x1f, 0x3ff);
3877 return packFloat16(aSign, 0x1f, 0);
3879 if (aExp == 0 && aSig == 0) {
3880 return packFloat16(aSign, 0, 0);
3882 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3883 * even if the input is denormal; however this is harmless because
3884 * the largest possible single-precision denormal is still smaller
3885 * than the smallest representable half-precision denormal, and so we
3886 * will end up ignoring aSig and returning via the "always return zero"
3892 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3895 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3901 aSign = extractFloat16Sign(a);
3902 aExp = extractFloat16Exp(a);
3903 aSig = extractFloat16Frac(a);
3905 if (aExp == 0x1f && ieee) {
3907 return commonNaNToFloat64(
3908 float16ToCommonNaN(a, status), status);
3910 return packFloat64(aSign, 0x7ff, 0);
3914 return packFloat64(aSign, 0, 0);
3917 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3920 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3923 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3930 a = float64_squash_input_denormal(a, status);
3932 aSig = extractFloat64Frac(a);
3933 aExp = extractFloat64Exp(a);
3934 aSign = extractFloat64Sign(a);
3935 if (aExp == 0x7FF) {
3937 /* Input is a NaN */
3939 float_raise(float_flag_invalid, status);
3940 return packFloat16(aSign, 0, 0);
3942 return commonNaNToFloat16(
3943 float64ToCommonNaN(a, status), status);
3947 float_raise(float_flag_invalid, status);
3948 return packFloat16(aSign, 0x1f, 0x3ff);
3950 return packFloat16(aSign, 0x1f, 0);
3952 shift64RightJamming(aSig, 29, &aSig);
3954 if (aExp == 0 && zSig == 0) {
3955 return packFloat16(aSign, 0, 0);
3957 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3958 * even if the input is denormal; however this is harmless because
3959 * the largest possible single-precision denormal is still smaller
3960 * than the smallest representable half-precision denormal, and so we
3961 * will end up ignoring aSig and returning via the "always return zero"
3967 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3970 /*----------------------------------------------------------------------------
3971 | Returns the result of converting the double-precision floating-point value
3972 | `a' to the extended double-precision floating-point format. The conversion
3973 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3975 *----------------------------------------------------------------------------*/
3977 floatx80 float64_to_floatx80(float64 a, float_status *status)
3983 a = float64_squash_input_denormal(a, status);
3984 aSig = extractFloat64Frac( a );
3985 aExp = extractFloat64Exp( a );
3986 aSign = extractFloat64Sign( a );
3987 if ( aExp == 0x7FF ) {
3989 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3991 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3994 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3995 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3999 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4003 /*----------------------------------------------------------------------------
4004 | Returns the result of converting the double-precision floating-point value
4005 | `a' to the quadruple-precision floating-point format. The conversion is
4006 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4008 *----------------------------------------------------------------------------*/
4010 float128 float64_to_float128(float64 a, float_status *status)
4014 uint64_t aSig, zSig0, zSig1;
4016 a = float64_squash_input_denormal(a, status);
4017 aSig = extractFloat64Frac( a );
4018 aExp = extractFloat64Exp( a );
4019 aSign = extractFloat64Sign( a );
4020 if ( aExp == 0x7FF ) {
4022 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4024 return packFloat128( aSign, 0x7FFF, 0, 0 );
4027 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4028 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4031 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4032 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4037 /*----------------------------------------------------------------------------
4038 | Returns the remainder of the double-precision floating-point value `a'
4039 | with respect to the corresponding value `b'. The operation is performed
4040 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4041 *----------------------------------------------------------------------------*/
4043 float64 float64_rem(float64 a, float64 b, float_status *status)
4046 int aExp, bExp, expDiff;
4047 uint64_t aSig, bSig;
4048 uint64_t q, alternateASig;
4051 a = float64_squash_input_denormal(a, status);
4052 b = float64_squash_input_denormal(b, status);
4053 aSig = extractFloat64Frac( a );
4054 aExp = extractFloat64Exp( a );
4055 aSign = extractFloat64Sign( a );
4056 bSig = extractFloat64Frac( b );
4057 bExp = extractFloat64Exp( b );
4058 if ( aExp == 0x7FF ) {
4059 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4060 return propagateFloat64NaN(a, b, status);
4062 float_raise(float_flag_invalid, status);
4063 return float64_default_nan(status);
4065 if ( bExp == 0x7FF ) {
4067 return propagateFloat64NaN(a, b, status);
4073 float_raise(float_flag_invalid, status);
4074 return float64_default_nan(status);
4076 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4079 if ( aSig == 0 ) return a;
4080 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4082 expDiff = aExp - bExp;
4083 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4084 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4085 if ( expDiff < 0 ) {
4086 if ( expDiff < -1 ) return a;
4089 q = ( bSig <= aSig );
4090 if ( q ) aSig -= bSig;
4092 while ( 0 < expDiff ) {
4093 q = estimateDiv128To64( aSig, 0, bSig );
4094 q = ( 2 < q ) ? q - 2 : 0;
4095 aSig = - ( ( bSig>>2 ) * q );
4099 if ( 0 < expDiff ) {
4100 q = estimateDiv128To64( aSig, 0, bSig );
4101 q = ( 2 < q ) ? q - 2 : 0;
4104 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4111 alternateASig = aSig;
4114 } while ( 0 <= (int64_t) aSig );
4115 sigMean = aSig + alternateASig;
4116 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4117 aSig = alternateASig;
4119 zSign = ( (int64_t) aSig < 0 );
4120 if ( zSign ) aSig = - aSig;
4121 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4126 /*----------------------------------------------------------------------------
4127 | Returns the square root of the double-precision floating-point value `a'.
4128 | The operation is performed according to the IEC/IEEE Standard for Binary
4129 | Floating-Point Arithmetic.
4130 *----------------------------------------------------------------------------*/
4132 float64 float64_sqrt(float64 a, float_status *status)
4136 uint64_t aSig, zSig, doubleZSig;
4137 uint64_t rem0, rem1, term0, term1;
4138 a = float64_squash_input_denormal(a, status);
4140 aSig = extractFloat64Frac( a );
4141 aExp = extractFloat64Exp( a );
4142 aSign = extractFloat64Sign( a );
4143 if ( aExp == 0x7FF ) {
4145 return propagateFloat64NaN(a, a, status);
4147 if ( ! aSign ) return a;
4148 float_raise(float_flag_invalid, status);
4149 return float64_default_nan(status);
4152 if ( ( aExp | aSig ) == 0 ) return a;
4153 float_raise(float_flag_invalid, status);
4154 return float64_default_nan(status);
4157 if ( aSig == 0 ) return float64_zero;
4158 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4160 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4161 aSig |= LIT64( 0x0010000000000000 );
4162 zSig = estimateSqrt32( aExp, aSig>>21 );
4163 aSig <<= 9 - ( aExp & 1 );
4164 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4165 if ( ( zSig & 0x1FF ) <= 5 ) {
4166 doubleZSig = zSig<<1;
4167 mul64To128( zSig, zSig, &term0, &term1 );
4168 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4169 while ( (int64_t) rem0 < 0 ) {
4172 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4174 zSig |= ( ( rem0 | rem1 ) != 0 );
4176 return roundAndPackFloat64(0, zExp, zSig, status);
4180 /*----------------------------------------------------------------------------
4181 | Returns the binary log of the double-precision floating-point value `a'.
4182 | The operation is performed according to the IEC/IEEE Standard for Binary
4183 | Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4185 float64 float64_log2(float64 a, float_status *status)
4189 uint64_t aSig, aSig0, aSig1, zSig, i;
4190 a = float64_squash_input_denormal(a, status);
4192 aSig = extractFloat64Frac( a );
4193 aExp = extractFloat64Exp( a );
4194 aSign = extractFloat64Sign( a );
4197 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4198 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4201 float_raise(float_flag_invalid, status);
4202 return float64_default_nan(status);
4204 if ( aExp == 0x7FF ) {
4206 return propagateFloat64NaN(a, float64_zero, status);
4212 aSig |= LIT64( 0x0010000000000000 );
4214 zSig = (uint64_t)aExp << 52;
4215 for (i = 1LL << 51; i > 0; i >>= 1) {
4216 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4217 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4218 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4226 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4229 /*----------------------------------------------------------------------------
4230 | Returns 1 if the double-precision floating-point value `a' is equal to the
4231 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4232 | if either operand is a NaN. Otherwise, the comparison is performed
4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 *----------------------------------------------------------------------------*/
4236 int float64_eq(float64 a, float64 b, float_status *status)
4239 a = float64_squash_input_denormal(a, status);
4240 b = float64_squash_input_denormal(b, status);
4242 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4243 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4245 float_raise(float_flag_invalid, status);
4248 av = float64_val(a);
4249 bv = float64_val(b);
4250 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4254 /*----------------------------------------------------------------------------
4255 | Returns 1 if the double-precision floating-point value `a' is less than or
4256 | equal to the corresponding value `b', and 0 otherwise. The invalid
4257 | exception is raised if either operand is a NaN. The comparison is performed
4258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4259 *----------------------------------------------------------------------------*/
4261 int float64_le(float64 a, float64 b, float_status *status)
4265 a = float64_squash_input_denormal(a, status);
4266 b = float64_squash_input_denormal(b, status);
4268 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4269 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4271 float_raise(float_flag_invalid, status);
4274 aSign = extractFloat64Sign( a );
4275 bSign = extractFloat64Sign( b );
4276 av = float64_val(a);
4277 bv = float64_val(b);
4278 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4279 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4283 /*----------------------------------------------------------------------------
4284 | Returns 1 if the double-precision floating-point value `a' is less than
4285 | the corresponding value `b', and 0 otherwise. The invalid exception is
4286 | raised if either operand is a NaN. The comparison is performed according
4287 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4288 *----------------------------------------------------------------------------*/
4290 int float64_lt(float64 a, float64 b, float_status *status)
4295 a = float64_squash_input_denormal(a, status);
4296 b = float64_squash_input_denormal(b, status);
4297 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4298 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4300 float_raise(float_flag_invalid, status);
4303 aSign = extractFloat64Sign( a );
4304 bSign = extractFloat64Sign( b );
4305 av = float64_val(a);
4306 bv = float64_val(b);
4307 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4308 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4312 /*----------------------------------------------------------------------------
4313 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4314 | be compared, and 0 otherwise. The invalid exception is raised if either
4315 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4316 | Standard for Binary Floating-Point Arithmetic.
4317 *----------------------------------------------------------------------------*/
4319 int float64_unordered(float64 a, float64 b, float_status *status)
4321 a = float64_squash_input_denormal(a, status);
4322 b = float64_squash_input_denormal(b, status);
4324 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4325 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4327 float_raise(float_flag_invalid, status);
4333 /*----------------------------------------------------------------------------
4334 | Returns 1 if the double-precision floating-point value `a' is equal to the
4335 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4336 | exception.The comparison is performed according to the IEC/IEEE Standard
4337 | for Binary Floating-Point Arithmetic.
4338 *----------------------------------------------------------------------------*/
4340 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4343 a = float64_squash_input_denormal(a, status);
4344 b = float64_squash_input_denormal(b, status);
4346 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4347 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4349 if (float64_is_signaling_nan(a, status)
4350 || float64_is_signaling_nan(b, status)) {
4351 float_raise(float_flag_invalid, status);
4355 av = float64_val(a);
4356 bv = float64_val(b);
4357 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4361 /*----------------------------------------------------------------------------
4362 | Returns 1 if the double-precision floating-point value `a' is less than or
4363 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4364 | cause an exception. Otherwise, the comparison is performed according to the
4365 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4366 *----------------------------------------------------------------------------*/
4368 int float64_le_quiet(float64 a, float64 b, float_status *status)
4372 a = float64_squash_input_denormal(a, status);
4373 b = float64_squash_input_denormal(b, status);
4375 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4376 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4378 if (float64_is_signaling_nan(a, status)
4379 || float64_is_signaling_nan(b, status)) {
4380 float_raise(float_flag_invalid, status);
4384 aSign = extractFloat64Sign( a );
4385 bSign = extractFloat64Sign( b );
4386 av = float64_val(a);
4387 bv = float64_val(b);
4388 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4389 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4393 /*----------------------------------------------------------------------------
4394 | Returns 1 if the double-precision floating-point value `a' is less than
4395 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4396 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4397 | Standard for Binary Floating-Point Arithmetic.
4398 *----------------------------------------------------------------------------*/
4400 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4404 a = float64_squash_input_denormal(a, status);
4405 b = float64_squash_input_denormal(b, status);
4407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4410 if (float64_is_signaling_nan(a, status)
4411 || float64_is_signaling_nan(b, status)) {
4412 float_raise(float_flag_invalid, status);
4416 aSign = extractFloat64Sign( a );
4417 bSign = extractFloat64Sign( b );
4418 av = float64_val(a);
4419 bv = float64_val(b);
4420 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4421 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4425 /*----------------------------------------------------------------------------
4426 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4427 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4428 | comparison is performed according to the IEC/IEEE Standard for Binary
4429 | Floating-Point Arithmetic.
4430 *----------------------------------------------------------------------------*/
4432 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4434 a = float64_squash_input_denormal(a, status);
4435 b = float64_squash_input_denormal(b, status);
4437 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4438 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4440 if (float64_is_signaling_nan(a, status)
4441 || float64_is_signaling_nan(b, status)) {
4442 float_raise(float_flag_invalid, status);
4449 /*----------------------------------------------------------------------------
4450 | Returns the result of converting the extended double-precision floating-
4451 | point value `a' to the 32-bit two's complement integer format. The
4452 | conversion is performed according to the IEC/IEEE Standard for Binary
4453 | Floating-Point Arithmetic---which means in particular that the conversion
4454 | is rounded according to the current rounding mode. If `a' is a NaN, the
4455 | largest positive integer is returned. Otherwise, if the conversion
4456 | overflows, the largest integer with the same sign as `a' is returned.
4457 *----------------------------------------------------------------------------*/
4459 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4462 int32_t aExp, shiftCount;
4465 if (floatx80_invalid_encoding(a)) {
4466 float_raise(float_flag_invalid, status);
4469 aSig = extractFloatx80Frac( a );
4470 aExp = extractFloatx80Exp( a );
4471 aSign = extractFloatx80Sign( a );
4472 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4473 shiftCount = 0x4037 - aExp;
4474 if ( shiftCount <= 0 ) shiftCount = 1;
4475 shift64RightJamming( aSig, shiftCount, &aSig );
4476 return roundAndPackInt32(aSign, aSig, status);
4480 /*----------------------------------------------------------------------------
4481 | Returns the result of converting the extended double-precision floating-
4482 | point value `a' to the 32-bit two's complement integer format. The
4483 | conversion is performed according to the IEC/IEEE Standard for Binary
4484 | Floating-Point Arithmetic, except that the conversion is always rounded
4485 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4486 | Otherwise, if the conversion overflows, the largest integer with the same
4487 | sign as `a' is returned.
4488 *----------------------------------------------------------------------------*/
4490 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4493 int32_t aExp, shiftCount;
4494 uint64_t aSig, savedASig;
4497 if (floatx80_invalid_encoding(a)) {
4498 float_raise(float_flag_invalid, status);
4501 aSig = extractFloatx80Frac( a );
4502 aExp = extractFloatx80Exp( a );
4503 aSign = extractFloatx80Sign( a );
4504 if ( 0x401E < aExp ) {
4505 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4508 else if ( aExp < 0x3FFF ) {
4510 status->float_exception_flags |= float_flag_inexact;
4514 shiftCount = 0x403E - aExp;
4516 aSig >>= shiftCount;
4518 if ( aSign ) z = - z;
4519 if ( ( z < 0 ) ^ aSign ) {
4521 float_raise(float_flag_invalid, status);
4522 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4524 if ( ( aSig<<shiftCount ) != savedASig ) {
4525 status->float_exception_flags |= float_flag_inexact;
4531 /*----------------------------------------------------------------------------
4532 | Returns the result of converting the extended double-precision floating-
4533 | point value `a' to the 64-bit two's complement integer format. The
4534 | conversion is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic---which means in particular that the conversion
4536 | is rounded according to the current rounding mode. If `a' is a NaN,
4537 | the largest positive integer is returned. Otherwise, if the conversion
4538 | overflows, the largest integer with the same sign as `a' is returned.
4539 *----------------------------------------------------------------------------*/
4541 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4544 int32_t aExp, shiftCount;
4545 uint64_t aSig, aSigExtra;
4547 if (floatx80_invalid_encoding(a)) {
4548 float_raise(float_flag_invalid, status);
4551 aSig = extractFloatx80Frac( a );
4552 aExp = extractFloatx80Exp( a );
4553 aSign = extractFloatx80Sign( a );
4554 shiftCount = 0x403E - aExp;
4555 if ( shiftCount <= 0 ) {
4557 float_raise(float_flag_invalid, status);
4559 || ( ( aExp == 0x7FFF )
4560 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4562 return LIT64( 0x7FFFFFFFFFFFFFFF );
4564 return (int64_t) LIT64( 0x8000000000000000 );
4569 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4571 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4575 /*----------------------------------------------------------------------------
4576 | Returns the result of converting the extended double-precision floating-
4577 | point value `a' to the 64-bit two's complement integer format. The
4578 | conversion is performed according to the IEC/IEEE Standard for Binary
4579 | Floating-Point Arithmetic, except that the conversion is always rounded
4580 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4581 | Otherwise, if the conversion overflows, the largest integer with the same
4582 | sign as `a' is returned.
4583 *----------------------------------------------------------------------------*/
4585 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4588 int32_t aExp, shiftCount;
4592 if (floatx80_invalid_encoding(a)) {
4593 float_raise(float_flag_invalid, status);
4596 aSig = extractFloatx80Frac( a );
4597 aExp = extractFloatx80Exp( a );
4598 aSign = extractFloatx80Sign( a );
4599 shiftCount = aExp - 0x403E;
4600 if ( 0 <= shiftCount ) {
4601 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4602 if ( ( a.high != 0xC03E ) || aSig ) {
4603 float_raise(float_flag_invalid, status);
4604 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4605 return LIT64( 0x7FFFFFFFFFFFFFFF );
4608 return (int64_t) LIT64( 0x8000000000000000 );
4610 else if ( aExp < 0x3FFF ) {
4612 status->float_exception_flags |= float_flag_inexact;
4616 z = aSig>>( - shiftCount );
4617 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4618 status->float_exception_flags |= float_flag_inexact;
4620 if ( aSign ) z = - z;
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of converting the extended double-precision floating-
4627 | point value `a' to the single-precision floating-point format. The
4628 | conversion is performed according to the IEC/IEEE Standard for Binary
4629 | Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4632 float32 floatx80_to_float32(floatx80 a, float_status *status)
4638 if (floatx80_invalid_encoding(a)) {
4639 float_raise(float_flag_invalid, status);
4640 return float32_default_nan(status);
4642 aSig = extractFloatx80Frac( a );
4643 aExp = extractFloatx80Exp( a );
4644 aSign = extractFloatx80Sign( a );
4645 if ( aExp == 0x7FFF ) {
4646 if ( (uint64_t) ( aSig<<1 ) ) {
4647 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4649 return packFloat32( aSign, 0xFF, 0 );
4651 shift64RightJamming( aSig, 33, &aSig );
4652 if ( aExp || aSig ) aExp -= 0x3F81;
4653 return roundAndPackFloat32(aSign, aExp, aSig, status);
4657 /*----------------------------------------------------------------------------
4658 | Returns the result of converting the extended double-precision floating-
4659 | point value `a' to the double-precision floating-point format. The
4660 | conversion is performed according to the IEC/IEEE Standard for Binary
4661 | Floating-Point Arithmetic.
4662 *----------------------------------------------------------------------------*/
4664 float64 floatx80_to_float64(floatx80 a, float_status *status)
4668 uint64_t aSig, zSig;
4670 if (floatx80_invalid_encoding(a)) {
4671 float_raise(float_flag_invalid, status);
4672 return float64_default_nan(status);
4674 aSig = extractFloatx80Frac( a );
4675 aExp = extractFloatx80Exp( a );
4676 aSign = extractFloatx80Sign( a );
4677 if ( aExp == 0x7FFF ) {
4678 if ( (uint64_t) ( aSig<<1 ) ) {
4679 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4681 return packFloat64( aSign, 0x7FF, 0 );
4683 shift64RightJamming( aSig, 1, &zSig );
4684 if ( aExp || aSig ) aExp -= 0x3C01;
4685 return roundAndPackFloat64(aSign, aExp, zSig, status);
4689 /*----------------------------------------------------------------------------
4690 | Returns the result of converting the extended double-precision floating-
4691 | point value `a' to the quadruple-precision floating-point format. The
4692 | conversion is performed according to the IEC/IEEE Standard for Binary
4693 | Floating-Point Arithmetic.
4694 *----------------------------------------------------------------------------*/
4696 float128 floatx80_to_float128(floatx80 a, float_status *status)
4700 uint64_t aSig, zSig0, zSig1;
4702 if (floatx80_invalid_encoding(a)) {
4703 float_raise(float_flag_invalid, status);
4704 return float128_default_nan(status);
4706 aSig = extractFloatx80Frac( a );
4707 aExp = extractFloatx80Exp( a );
4708 aSign = extractFloatx80Sign( a );
4709 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4710 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4712 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4713 return packFloat128( aSign, aExp, zSig0, zSig1 );
4717 /*----------------------------------------------------------------------------
4718 | Rounds the extended double-precision floating-point value `a'
4719 | to the precision provided by floatx80_rounding_precision and returns the
4720 | result as an extended double-precision floating-point value.
4721 | The operation is performed according to the IEC/IEEE Standard for Binary
4722 | Floating-Point Arithmetic.
4723 *----------------------------------------------------------------------------*/
4725 floatx80 floatx80_round(floatx80 a, float_status *status)
4727 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4728 extractFloatx80Sign(a),
4729 extractFloatx80Exp(a),
4730 extractFloatx80Frac(a), 0, status);
4733 /*----------------------------------------------------------------------------
4734 | Rounds the extended double-precision floating-point value `a' to an integer,
4735 | and returns the result as an extended quadruple-precision floating-point
4736 | value. The operation is performed according to the IEC/IEEE Standard for
4737 | Binary Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4740 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4744 uint64_t lastBitMask, roundBitsMask;
4747 if (floatx80_invalid_encoding(a)) {
4748 float_raise(float_flag_invalid, status);
4749 return floatx80_default_nan(status);
4751 aExp = extractFloatx80Exp( a );
4752 if ( 0x403E <= aExp ) {
4753 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4754 return propagateFloatx80NaN(a, a, status);
4758 if ( aExp < 0x3FFF ) {
4760 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4763 status->float_exception_flags |= float_flag_inexact;
4764 aSign = extractFloatx80Sign( a );
4765 switch (status->float_rounding_mode) {
4766 case float_round_nearest_even:
4767 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4770 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4773 case float_round_ties_away:
4774 if (aExp == 0x3FFE) {
4775 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4778 case float_round_down:
4781 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4782 : packFloatx80( 0, 0, 0 );
4783 case float_round_up:
4785 aSign ? packFloatx80( 1, 0, 0 )
4786 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4788 return packFloatx80( aSign, 0, 0 );
4791 lastBitMask <<= 0x403E - aExp;
4792 roundBitsMask = lastBitMask - 1;
4794 switch (status->float_rounding_mode) {
4795 case float_round_nearest_even:
4796 z.low += lastBitMask>>1;
4797 if ((z.low & roundBitsMask) == 0) {
4798 z.low &= ~lastBitMask;
4801 case float_round_ties_away:
4802 z.low += lastBitMask >> 1;
4804 case float_round_to_zero:
4806 case float_round_up:
4807 if (!extractFloatx80Sign(z)) {
4808 z.low += roundBitsMask;
4811 case float_round_down:
4812 if (extractFloatx80Sign(z)) {
4813 z.low += roundBitsMask;
4819 z.low &= ~ roundBitsMask;
4822 z.low = LIT64( 0x8000000000000000 );
4824 if (z.low != a.low) {
4825 status->float_exception_flags |= float_flag_inexact;
4831 /*----------------------------------------------------------------------------
4832 | Returns the result of adding the absolute values of the extended double-
4833 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4834 | negated before being returned. `zSign' is ignored if the result is a NaN.
4835 | The addition is performed according to the IEC/IEEE Standard for Binary
4836 | Floating-Point Arithmetic.
4837 *----------------------------------------------------------------------------*/
4839 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4840 float_status *status)
4842 int32_t aExp, bExp, zExp;
4843 uint64_t aSig, bSig, zSig0, zSig1;
4846 aSig = extractFloatx80Frac( a );
4847 aExp = extractFloatx80Exp( a );
4848 bSig = extractFloatx80Frac( b );
4849 bExp = extractFloatx80Exp( b );
4850 expDiff = aExp - bExp;
4851 if ( 0 < expDiff ) {
4852 if ( aExp == 0x7FFF ) {
4853 if ((uint64_t)(aSig << 1)) {
4854 return propagateFloatx80NaN(a, b, status);
4858 if ( bExp == 0 ) --expDiff;
4859 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4862 else if ( expDiff < 0 ) {
4863 if ( bExp == 0x7FFF ) {
4864 if ((uint64_t)(bSig << 1)) {
4865 return propagateFloatx80NaN(a, b, status);
4867 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4869 if ( aExp == 0 ) ++expDiff;
4870 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4874 if ( aExp == 0x7FFF ) {
4875 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4876 return propagateFloatx80NaN(a, b, status);
4881 zSig0 = aSig + bSig;
4883 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4889 zSig0 = aSig + bSig;
4890 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4892 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4893 zSig0 |= LIT64( 0x8000000000000000 );
4896 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4897 zSign, zExp, zSig0, zSig1, status);
4900 /*----------------------------------------------------------------------------
4901 | Returns the result of subtracting the absolute values of the extended
4902 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4903 | difference is negated before being returned. `zSign' is ignored if the
4904 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4905 | Standard for Binary Floating-Point Arithmetic.
4906 *----------------------------------------------------------------------------*/
4908 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4909 float_status *status)
4911 int32_t aExp, bExp, zExp;
4912 uint64_t aSig, bSig, zSig0, zSig1;
4915 aSig = extractFloatx80Frac( a );
4916 aExp = extractFloatx80Exp( a );
4917 bSig = extractFloatx80Frac( b );
4918 bExp = extractFloatx80Exp( b );
4919 expDiff = aExp - bExp;
4920 if ( 0 < expDiff ) goto aExpBigger;
4921 if ( expDiff < 0 ) goto bExpBigger;
4922 if ( aExp == 0x7FFF ) {
4923 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4924 return propagateFloatx80NaN(a, b, status);
4926 float_raise(float_flag_invalid, status);
4927 return floatx80_default_nan(status);
4934 if ( bSig < aSig ) goto aBigger;
4935 if ( aSig < bSig ) goto bBigger;
4936 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4938 if ( bExp == 0x7FFF ) {
4939 if ((uint64_t)(bSig << 1)) {
4940 return propagateFloatx80NaN(a, b, status);
4942 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4944 if ( aExp == 0 ) ++expDiff;
4945 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4947 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4950 goto normalizeRoundAndPack;
4952 if ( aExp == 0x7FFF ) {
4953 if ((uint64_t)(aSig << 1)) {
4954 return propagateFloatx80NaN(a, b, status);
4958 if ( bExp == 0 ) --expDiff;
4959 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4961 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4963 normalizeRoundAndPack:
4964 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4965 zSign, zExp, zSig0, zSig1, status);
4968 /*----------------------------------------------------------------------------
4969 | Returns the result of adding the extended double-precision floating-point
4970 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4971 | Standard for Binary Floating-Point Arithmetic.
4972 *----------------------------------------------------------------------------*/
4974 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4978 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4979 float_raise(float_flag_invalid, status);
4980 return floatx80_default_nan(status);
4982 aSign = extractFloatx80Sign( a );
4983 bSign = extractFloatx80Sign( b );
4984 if ( aSign == bSign ) {
4985 return addFloatx80Sigs(a, b, aSign, status);
4988 return subFloatx80Sigs(a, b, aSign, status);
4993 /*----------------------------------------------------------------------------
4994 | Returns the result of subtracting the extended double-precision floating-
4995 | point values `a' and `b'. The operation is performed according to the
4996 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4999 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5003 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5004 float_raise(float_flag_invalid, status);
5005 return floatx80_default_nan(status);
5007 aSign = extractFloatx80Sign( a );
5008 bSign = extractFloatx80Sign( b );
5009 if ( aSign == bSign ) {
5010 return subFloatx80Sigs(a, b, aSign, status);
5013 return addFloatx80Sigs(a, b, aSign, status);
5018 /*----------------------------------------------------------------------------
5019 | Returns the result of multiplying the extended double-precision floating-
5020 | point values `a' and `b'. The operation is performed according to the
5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5024 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5026 flag aSign, bSign, zSign;
5027 int32_t aExp, bExp, zExp;
5028 uint64_t aSig, bSig, zSig0, zSig1;
5030 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5031 float_raise(float_flag_invalid, status);
5032 return floatx80_default_nan(status);
5034 aSig = extractFloatx80Frac( a );
5035 aExp = extractFloatx80Exp( a );
5036 aSign = extractFloatx80Sign( a );
5037 bSig = extractFloatx80Frac( b );
5038 bExp = extractFloatx80Exp( b );
5039 bSign = extractFloatx80Sign( b );
5040 zSign = aSign ^ bSign;
5041 if ( aExp == 0x7FFF ) {
5042 if ( (uint64_t) ( aSig<<1 )
5043 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5044 return propagateFloatx80NaN(a, b, status);
5046 if ( ( bExp | bSig ) == 0 ) goto invalid;
5047 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5049 if ( bExp == 0x7FFF ) {
5050 if ((uint64_t)(bSig << 1)) {
5051 return propagateFloatx80NaN(a, b, status);
5053 if ( ( aExp | aSig ) == 0 ) {
5055 float_raise(float_flag_invalid, status);
5056 return floatx80_default_nan(status);
5058 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5061 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5062 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5065 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5066 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5068 zExp = aExp + bExp - 0x3FFE;
5069 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5070 if ( 0 < (int64_t) zSig0 ) {
5071 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5074 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5075 zSign, zExp, zSig0, zSig1, status);
5078 /*----------------------------------------------------------------------------
5079 | Returns the result of dividing the extended double-precision floating-point
5080 | value `a' by the corresponding value `b'. The operation is performed
5081 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5082 *----------------------------------------------------------------------------*/
5084 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5086 flag aSign, bSign, zSign;
5087 int32_t aExp, bExp, zExp;
5088 uint64_t aSig, bSig, zSig0, zSig1;
5089 uint64_t rem0, rem1, rem2, term0, term1, term2;
5091 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5092 float_raise(float_flag_invalid, status);
5093 return floatx80_default_nan(status);
5095 aSig = extractFloatx80Frac( a );
5096 aExp = extractFloatx80Exp( a );
5097 aSign = extractFloatx80Sign( a );
5098 bSig = extractFloatx80Frac( b );
5099 bExp = extractFloatx80Exp( b );
5100 bSign = extractFloatx80Sign( b );
5101 zSign = aSign ^ bSign;
5102 if ( aExp == 0x7FFF ) {
5103 if ((uint64_t)(aSig << 1)) {
5104 return propagateFloatx80NaN(a, b, status);
5106 if ( bExp == 0x7FFF ) {
5107 if ((uint64_t)(bSig << 1)) {
5108 return propagateFloatx80NaN(a, b, status);
5112 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5114 if ( bExp == 0x7FFF ) {
5115 if ((uint64_t)(bSig << 1)) {
5116 return propagateFloatx80NaN(a, b, status);
5118 return packFloatx80( zSign, 0, 0 );
5122 if ( ( aExp | aSig ) == 0 ) {
5124 float_raise(float_flag_invalid, status);
5125 return floatx80_default_nan(status);
5127 float_raise(float_flag_divbyzero, status);
5128 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5130 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5133 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5134 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5136 zExp = aExp - bExp + 0x3FFE;
5138 if ( bSig <= aSig ) {
5139 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5142 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5143 mul64To128( bSig, zSig0, &term0, &term1 );
5144 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5145 while ( (int64_t) rem0 < 0 ) {
5147 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5149 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5150 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5151 mul64To128( bSig, zSig1, &term1, &term2 );
5152 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5153 while ( (int64_t) rem1 < 0 ) {
5155 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5157 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5159 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5160 zSign, zExp, zSig0, zSig1, status);
5163 /*----------------------------------------------------------------------------
5164 | Returns the remainder of the extended double-precision floating-point value
5165 | `a' with respect to the corresponding value `b'. The operation is performed
5166 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5167 *----------------------------------------------------------------------------*/
5169 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5172 int32_t aExp, bExp, expDiff;
5173 uint64_t aSig0, aSig1, bSig;
5174 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5176 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5177 float_raise(float_flag_invalid, status);
5178 return floatx80_default_nan(status);
5180 aSig0 = extractFloatx80Frac( a );
5181 aExp = extractFloatx80Exp( a );
5182 aSign = extractFloatx80Sign( a );
5183 bSig = extractFloatx80Frac( b );
5184 bExp = extractFloatx80Exp( b );
5185 if ( aExp == 0x7FFF ) {
5186 if ( (uint64_t) ( aSig0<<1 )
5187 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5188 return propagateFloatx80NaN(a, b, status);
5192 if ( bExp == 0x7FFF ) {
5193 if ((uint64_t)(bSig << 1)) {
5194 return propagateFloatx80NaN(a, b, status);
5201 float_raise(float_flag_invalid, status);
5202 return floatx80_default_nan(status);
5204 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5207 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5208 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5210 bSig |= LIT64( 0x8000000000000000 );
5212 expDiff = aExp - bExp;
5214 if ( expDiff < 0 ) {
5215 if ( expDiff < -1 ) return a;
5216 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5219 q = ( bSig <= aSig0 );
5220 if ( q ) aSig0 -= bSig;
5222 while ( 0 < expDiff ) {
5223 q = estimateDiv128To64( aSig0, aSig1, bSig );
5224 q = ( 2 < q ) ? q - 2 : 0;
5225 mul64To128( bSig, q, &term0, &term1 );
5226 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5227 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5231 if ( 0 < expDiff ) {
5232 q = estimateDiv128To64( aSig0, aSig1, bSig );
5233 q = ( 2 < q ) ? q - 2 : 0;
5235 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5236 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5237 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5238 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5240 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5247 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5248 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5249 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5252 aSig0 = alternateASig0;
5253 aSig1 = alternateASig1;
5257 normalizeRoundAndPackFloatx80(
5258 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5262 /*----------------------------------------------------------------------------
5263 | Returns the square root of the extended double-precision floating-point
5264 | value `a'. The operation is performed according to the IEC/IEEE Standard
5265 | for Binary Floating-Point Arithmetic.
5266 *----------------------------------------------------------------------------*/
5268 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5272 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5273 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5275 if (floatx80_invalid_encoding(a)) {
5276 float_raise(float_flag_invalid, status);
5277 return floatx80_default_nan(status);
5279 aSig0 = extractFloatx80Frac( a );
5280 aExp = extractFloatx80Exp( a );
5281 aSign = extractFloatx80Sign( a );
5282 if ( aExp == 0x7FFF ) {
5283 if ((uint64_t)(aSig0 << 1)) {
5284 return propagateFloatx80NaN(a, a, status);
5286 if ( ! aSign ) return a;
5290 if ( ( aExp | aSig0 ) == 0 ) return a;
5292 float_raise(float_flag_invalid, status);
5293 return floatx80_default_nan(status);
5296 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5297 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5299 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5300 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5301 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5302 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5303 doubleZSig0 = zSig0<<1;
5304 mul64To128( zSig0, zSig0, &term0, &term1 );
5305 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5306 while ( (int64_t) rem0 < 0 ) {
5309 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5311 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5312 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5313 if ( zSig1 == 0 ) zSig1 = 1;
5314 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5315 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5316 mul64To128( zSig1, zSig1, &term2, &term3 );
5317 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5318 while ( (int64_t) rem1 < 0 ) {
5320 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5322 term2 |= doubleZSig0;
5323 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5325 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5327 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5328 zSig0 |= doubleZSig0;
5329 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5330 0, zExp, zSig0, zSig1, status);
5333 /*----------------------------------------------------------------------------
5334 | Returns 1 if the extended double-precision floating-point value `a' is equal
5335 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5336 | raised if either operand is a NaN. Otherwise, the comparison is performed
5337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5338 *----------------------------------------------------------------------------*/
5340 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5343 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5344 || (extractFloatx80Exp(a) == 0x7FFF
5345 && (uint64_t) (extractFloatx80Frac(a) << 1))
5346 || (extractFloatx80Exp(b) == 0x7FFF
5347 && (uint64_t) (extractFloatx80Frac(b) << 1))
5349 float_raise(float_flag_invalid, status);
5354 && ( ( a.high == b.high )
5356 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5361 /*----------------------------------------------------------------------------
5362 | Returns 1 if the extended double-precision floating-point value `a' is
5363 | less than or equal to the corresponding value `b', and 0 otherwise. The
5364 | invalid exception is raised if either operand is a NaN. The comparison is
5365 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5367 *----------------------------------------------------------------------------*/
5369 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5373 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5374 || (extractFloatx80Exp(a) == 0x7FFF
5375 && (uint64_t) (extractFloatx80Frac(a) << 1))
5376 || (extractFloatx80Exp(b) == 0x7FFF
5377 && (uint64_t) (extractFloatx80Frac(b) << 1))
5379 float_raise(float_flag_invalid, status);
5382 aSign = extractFloatx80Sign( a );
5383 bSign = extractFloatx80Sign( b );
5384 if ( aSign != bSign ) {
5387 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5391 aSign ? le128( b.high, b.low, a.high, a.low )
5392 : le128( a.high, a.low, b.high, b.low );
5396 /*----------------------------------------------------------------------------
5397 | Returns 1 if the extended double-precision floating-point value `a' is
5398 | less than the corresponding value `b', and 0 otherwise. The invalid
5399 | exception is raised if either operand is a NaN. The comparison is performed
5400 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5401 *----------------------------------------------------------------------------*/
5403 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5407 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5408 || (extractFloatx80Exp(a) == 0x7FFF
5409 && (uint64_t) (extractFloatx80Frac(a) << 1))
5410 || (extractFloatx80Exp(b) == 0x7FFF
5411 && (uint64_t) (extractFloatx80Frac(b) << 1))
5413 float_raise(float_flag_invalid, status);
5416 aSign = extractFloatx80Sign( a );
5417 bSign = extractFloatx80Sign( b );
5418 if ( aSign != bSign ) {
5421 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5425 aSign ? lt128( b.high, b.low, a.high, a.low )
5426 : lt128( a.high, a.low, b.high, b.low );
5430 /*----------------------------------------------------------------------------
5431 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5432 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5433 | either operand is a NaN. The comparison is performed according to the
5434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5435 *----------------------------------------------------------------------------*/
5436 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5438 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5439 || (extractFloatx80Exp(a) == 0x7FFF
5440 && (uint64_t) (extractFloatx80Frac(a) << 1))
5441 || (extractFloatx80Exp(b) == 0x7FFF
5442 && (uint64_t) (extractFloatx80Frac(b) << 1))
5444 float_raise(float_flag_invalid, status);
5450 /*----------------------------------------------------------------------------
5451 | Returns 1 if the extended double-precision floating-point value `a' is
5452 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5453 | cause an exception. The comparison is performed according to the IEC/IEEE
5454 | Standard for Binary Floating-Point Arithmetic.
5455 *----------------------------------------------------------------------------*/
5457 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5460 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5461 float_raise(float_flag_invalid, status);
5464 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5465 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5466 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5467 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5469 if (floatx80_is_signaling_nan(a, status)
5470 || floatx80_is_signaling_nan(b, status)) {
5471 float_raise(float_flag_invalid, status);
5477 && ( ( a.high == b.high )
5479 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5484 /*----------------------------------------------------------------------------
5485 | Returns 1 if the extended double-precision floating-point value `a' is less
5486 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5487 | do not cause an exception. Otherwise, the comparison is performed according
5488 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5489 *----------------------------------------------------------------------------*/
5491 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5495 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5496 float_raise(float_flag_invalid, status);
5499 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5500 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5501 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5502 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5504 if (floatx80_is_signaling_nan(a, status)
5505 || floatx80_is_signaling_nan(b, status)) {
5506 float_raise(float_flag_invalid, status);
5510 aSign = extractFloatx80Sign( a );
5511 bSign = extractFloatx80Sign( b );
5512 if ( aSign != bSign ) {
5515 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5519 aSign ? le128( b.high, b.low, a.high, a.low )
5520 : le128( a.high, a.low, b.high, b.low );
5524 /*----------------------------------------------------------------------------
5525 | Returns 1 if the extended double-precision floating-point value `a' is less
5526 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5527 | an exception. Otherwise, the comparison is performed according to the
5528 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5529 *----------------------------------------------------------------------------*/
5531 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5535 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5536 float_raise(float_flag_invalid, status);
5539 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5540 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5541 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5542 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5544 if (floatx80_is_signaling_nan(a, status)
5545 || floatx80_is_signaling_nan(b, status)) {
5546 float_raise(float_flag_invalid, status);
5550 aSign = extractFloatx80Sign( a );
5551 bSign = extractFloatx80Sign( b );
5552 if ( aSign != bSign ) {
5555 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5559 aSign ? lt128( b.high, b.low, a.high, a.low )
5560 : lt128( a.high, a.low, b.high, b.low );
5564 /*----------------------------------------------------------------------------
5565 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5566 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5567 | The comparison is performed according to the IEC/IEEE Standard for Binary
5568 | Floating-Point Arithmetic.
5569 *----------------------------------------------------------------------------*/
5570 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5572 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5573 float_raise(float_flag_invalid, status);
5576 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5577 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5578 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5579 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5581 if (floatx80_is_signaling_nan(a, status)
5582 || floatx80_is_signaling_nan(b, status)) {
5583 float_raise(float_flag_invalid, status);
5590 /*----------------------------------------------------------------------------
5591 | Returns the result of converting the quadruple-precision floating-point
5592 | value `a' to the 32-bit two's complement integer format. The conversion
5593 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5594 | Arithmetic---which means in particular that the conversion is rounded
5595 | according to the current rounding mode. If `a' is a NaN, the largest
5596 | positive integer is returned. Otherwise, if the conversion overflows, the
5597 | largest integer with the same sign as `a' is returned.
5598 *----------------------------------------------------------------------------*/
5600 int32_t float128_to_int32(float128 a, float_status *status)
5603 int32_t aExp, shiftCount;
5604 uint64_t aSig0, aSig1;
5606 aSig1 = extractFloat128Frac1( a );
5607 aSig0 = extractFloat128Frac0( a );
5608 aExp = extractFloat128Exp( a );
5609 aSign = extractFloat128Sign( a );
5610 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5611 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5612 aSig0 |= ( aSig1 != 0 );
5613 shiftCount = 0x4028 - aExp;
5614 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5615 return roundAndPackInt32(aSign, aSig0, status);
5619 /*----------------------------------------------------------------------------
5620 | Returns the result of converting the quadruple-precision floating-point
5621 | value `a' to the 32-bit two's complement integer format. The conversion
5622 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5623 | Arithmetic, except that the conversion is always rounded toward zero. If
5624 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5625 | conversion overflows, the largest integer with the same sign as `a' is
5627 *----------------------------------------------------------------------------*/
5629 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5632 int32_t aExp, shiftCount;
5633 uint64_t aSig0, aSig1, savedASig;
5636 aSig1 = extractFloat128Frac1( a );
5637 aSig0 = extractFloat128Frac0( a );
5638 aExp = extractFloat128Exp( a );
5639 aSign = extractFloat128Sign( a );
5640 aSig0 |= ( aSig1 != 0 );
5641 if ( 0x401E < aExp ) {
5642 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5645 else if ( aExp < 0x3FFF ) {
5646 if (aExp || aSig0) {
5647 status->float_exception_flags |= float_flag_inexact;
5651 aSig0 |= LIT64( 0x0001000000000000 );
5652 shiftCount = 0x402F - aExp;
5654 aSig0 >>= shiftCount;
5656 if ( aSign ) z = - z;
5657 if ( ( z < 0 ) ^ aSign ) {
5659 float_raise(float_flag_invalid, status);
5660 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5662 if ( ( aSig0<<shiftCount ) != savedASig ) {
5663 status->float_exception_flags |= float_flag_inexact;
5669 /*----------------------------------------------------------------------------
5670 | Returns the result of converting the quadruple-precision floating-point
5671 | value `a' to the 64-bit two's complement integer format. The conversion
5672 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5673 | Arithmetic---which means in particular that the conversion is rounded
5674 | according to the current rounding mode. If `a' is a NaN, the largest
5675 | positive integer is returned. Otherwise, if the conversion overflows, the
5676 | largest integer with the same sign as `a' is returned.
5677 *----------------------------------------------------------------------------*/
5679 int64_t float128_to_int64(float128 a, float_status *status)
5682 int32_t aExp, shiftCount;
5683 uint64_t aSig0, aSig1;
5685 aSig1 = extractFloat128Frac1( a );
5686 aSig0 = extractFloat128Frac0( a );
5687 aExp = extractFloat128Exp( a );
5688 aSign = extractFloat128Sign( a );
5689 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5690 shiftCount = 0x402F - aExp;
5691 if ( shiftCount <= 0 ) {
5692 if ( 0x403E < aExp ) {
5693 float_raise(float_flag_invalid, status);
5695 || ( ( aExp == 0x7FFF )
5696 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5699 return LIT64( 0x7FFFFFFFFFFFFFFF );
5701 return (int64_t) LIT64( 0x8000000000000000 );
5703 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5706 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5708 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5712 /*----------------------------------------------------------------------------
5713 | Returns the result of converting the quadruple-precision floating-point
5714 | value `a' to the 64-bit two's complement integer format. The conversion
5715 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5716 | Arithmetic, except that the conversion is always rounded toward zero.
5717 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5718 | the conversion overflows, the largest integer with the same sign as `a' is
5720 *----------------------------------------------------------------------------*/
5722 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5725 int32_t aExp, shiftCount;
5726 uint64_t aSig0, aSig1;
5729 aSig1 = extractFloat128Frac1( a );
5730 aSig0 = extractFloat128Frac0( a );
5731 aExp = extractFloat128Exp( a );
5732 aSign = extractFloat128Sign( a );
5733 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5734 shiftCount = aExp - 0x402F;
5735 if ( 0 < shiftCount ) {
5736 if ( 0x403E <= aExp ) {
5737 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5738 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5739 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5741 status->float_exception_flags |= float_flag_inexact;
5745 float_raise(float_flag_invalid, status);
5746 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5747 return LIT64( 0x7FFFFFFFFFFFFFFF );
5750 return (int64_t) LIT64( 0x8000000000000000 );
5752 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5753 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5754 status->float_exception_flags |= float_flag_inexact;
5758 if ( aExp < 0x3FFF ) {
5759 if ( aExp | aSig0 | aSig1 ) {
5760 status->float_exception_flags |= float_flag_inexact;
5764 z = aSig0>>( - shiftCount );
5766 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5767 status->float_exception_flags |= float_flag_inexact;
5770 if ( aSign ) z = - z;
5775 /*----------------------------------------------------------------------------
5776 | Returns the result of converting the quadruple-precision floating-point value
5777 | `a' to the 64-bit unsigned integer format. The conversion is
5778 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5779 | Arithmetic---which means in particular that the conversion is rounded
5780 | according to the current rounding mode. If `a' is a NaN, the largest
5781 | positive integer is returned. If the conversion overflows, the
5782 | largest unsigned integer is returned. If 'a' is negative, the value is
5783 | rounded and zero is returned; negative values that do not round to zero
5784 | will raise the inexact exception.
5785 *----------------------------------------------------------------------------*/
5787 uint64_t float128_to_uint64(float128 a, float_status *status)
5792 uint64_t aSig0, aSig1;
5794 aSig0 = extractFloat128Frac0(a);
5795 aSig1 = extractFloat128Frac1(a);
5796 aExp = extractFloat128Exp(a);
5797 aSign = extractFloat128Sign(a);
5798 if (aSign && (aExp > 0x3FFE)) {
5799 float_raise(float_flag_invalid, status);
5800 if (float128_is_any_nan(a)) {
5801 return LIT64(0xFFFFFFFFFFFFFFFF);
5807 aSig0 |= LIT64(0x0001000000000000);
5809 shiftCount = 0x402F - aExp;
5810 if (shiftCount <= 0) {
5811 if (0x403E < aExp) {
5812 float_raise(float_flag_invalid, status);
5813 return LIT64(0xFFFFFFFFFFFFFFFF);
5815 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5817 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5819 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5822 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5825 signed char current_rounding_mode = status->float_rounding_mode;
5827 set_float_rounding_mode(float_round_to_zero, status);
5828 v = float128_to_uint64(a, status);
5829 set_float_rounding_mode(current_rounding_mode, status);
5834 /*----------------------------------------------------------------------------
5835 | Returns the result of converting the quadruple-precision floating-point
5836 | value `a' to the 32-bit unsigned integer format. The conversion
5837 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5838 | Arithmetic except that the conversion is always rounded toward zero.
5839 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5840 | if the conversion overflows, the largest unsigned integer is returned.
5841 | If 'a' is negative, the value is rounded and zero is returned; negative
5842 | values that do not round to zero will raise the inexact exception.
5843 *----------------------------------------------------------------------------*/
5845 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5849 int old_exc_flags = get_float_exception_flags(status);
5851 v = float128_to_uint64_round_to_zero(a, status);
5852 if (v > 0xffffffff) {
5857 set_float_exception_flags(old_exc_flags, status);
5858 float_raise(float_flag_invalid, status);
5862 /*----------------------------------------------------------------------------
5863 | Returns the result of converting the quadruple-precision floating-point
5864 | value `a' to the single-precision floating-point format. The conversion
5865 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5867 *----------------------------------------------------------------------------*/
5869 float32 float128_to_float32(float128 a, float_status *status)
5873 uint64_t aSig0, aSig1;
5876 aSig1 = extractFloat128Frac1( a );
5877 aSig0 = extractFloat128Frac0( a );
5878 aExp = extractFloat128Exp( a );
5879 aSign = extractFloat128Sign( a );
5880 if ( aExp == 0x7FFF ) {
5881 if ( aSig0 | aSig1 ) {
5882 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5884 return packFloat32( aSign, 0xFF, 0 );
5886 aSig0 |= ( aSig1 != 0 );
5887 shift64RightJamming( aSig0, 18, &aSig0 );
5889 if ( aExp || zSig ) {
5893 return roundAndPackFloat32(aSign, aExp, zSig, status);
5897 /*----------------------------------------------------------------------------
5898 | Returns the result of converting the quadruple-precision floating-point
5899 | value `a' to the double-precision floating-point format. The conversion
5900 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5902 *----------------------------------------------------------------------------*/
5904 float64 float128_to_float64(float128 a, float_status *status)
5908 uint64_t aSig0, aSig1;
5910 aSig1 = extractFloat128Frac1( a );
5911 aSig0 = extractFloat128Frac0( a );
5912 aExp = extractFloat128Exp( a );
5913 aSign = extractFloat128Sign( a );
5914 if ( aExp == 0x7FFF ) {
5915 if ( aSig0 | aSig1 ) {
5916 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5918 return packFloat64( aSign, 0x7FF, 0 );
5920 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5921 aSig0 |= ( aSig1 != 0 );
5922 if ( aExp || aSig0 ) {
5923 aSig0 |= LIT64( 0x4000000000000000 );
5926 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5930 /*----------------------------------------------------------------------------
5931 | Returns the result of converting the quadruple-precision floating-point
5932 | value `a' to the extended double-precision floating-point format. The
5933 | conversion is performed according to the IEC/IEEE Standard for Binary
5934 | Floating-Point Arithmetic.
5935 *----------------------------------------------------------------------------*/
5937 floatx80 float128_to_floatx80(float128 a, float_status *status)
5941 uint64_t aSig0, aSig1;
5943 aSig1 = extractFloat128Frac1( a );
5944 aSig0 = extractFloat128Frac0( a );
5945 aExp = extractFloat128Exp( a );
5946 aSign = extractFloat128Sign( a );
5947 if ( aExp == 0x7FFF ) {
5948 if ( aSig0 | aSig1 ) {
5949 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5951 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5954 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5955 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5958 aSig0 |= LIT64( 0x0001000000000000 );
5960 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5961 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5965 /*----------------------------------------------------------------------------
5966 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5967 | returns the result as a quadruple-precision floating-point value. The
5968 | operation is performed according to the IEC/IEEE Standard for Binary
5969 | Floating-Point Arithmetic.
5970 *----------------------------------------------------------------------------*/
5972 float128 float128_round_to_int(float128 a, float_status *status)
5976 uint64_t lastBitMask, roundBitsMask;
5979 aExp = extractFloat128Exp( a );
5980 if ( 0x402F <= aExp ) {
5981 if ( 0x406F <= aExp ) {
5982 if ( ( aExp == 0x7FFF )
5983 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5985 return propagateFloat128NaN(a, a, status);
5990 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5991 roundBitsMask = lastBitMask - 1;
5993 switch (status->float_rounding_mode) {
5994 case float_round_nearest_even:
5995 if ( lastBitMask ) {
5996 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5997 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6000 if ( (int64_t) z.low < 0 ) {
6002 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6006 case float_round_ties_away:
6008 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6010 if ((int64_t) z.low < 0) {
6015 case float_round_to_zero:
6017 case float_round_up:
6018 if (!extractFloat128Sign(z)) {
6019 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6022 case float_round_down:
6023 if (extractFloat128Sign(z)) {
6024 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6030 z.low &= ~ roundBitsMask;
6033 if ( aExp < 0x3FFF ) {
6034 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6035 status->float_exception_flags |= float_flag_inexact;
6036 aSign = extractFloat128Sign( a );
6037 switch (status->float_rounding_mode) {
6038 case float_round_nearest_even:
6039 if ( ( aExp == 0x3FFE )
6040 && ( extractFloat128Frac0( a )
6041 | extractFloat128Frac1( a ) )
6043 return packFloat128( aSign, 0x3FFF, 0, 0 );
6046 case float_round_ties_away:
6047 if (aExp == 0x3FFE) {
6048 return packFloat128(aSign, 0x3FFF, 0, 0);
6051 case float_round_down:
6053 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6054 : packFloat128( 0, 0, 0, 0 );
6055 case float_round_up:
6057 aSign ? packFloat128( 1, 0, 0, 0 )
6058 : packFloat128( 0, 0x3FFF, 0, 0 );
6060 return packFloat128( aSign, 0, 0, 0 );
6063 lastBitMask <<= 0x402F - aExp;
6064 roundBitsMask = lastBitMask - 1;
6067 switch (status->float_rounding_mode) {
6068 case float_round_nearest_even:
6069 z.high += lastBitMask>>1;
6070 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6071 z.high &= ~ lastBitMask;
6074 case float_round_ties_away:
6075 z.high += lastBitMask>>1;
6077 case float_round_to_zero:
6079 case float_round_up:
6080 if (!extractFloat128Sign(z)) {
6081 z.high |= ( a.low != 0 );
6082 z.high += roundBitsMask;
6085 case float_round_down:
6086 if (extractFloat128Sign(z)) {
6087 z.high |= (a.low != 0);
6088 z.high += roundBitsMask;
6094 z.high &= ~ roundBitsMask;
6096 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6097 status->float_exception_flags |= float_flag_inexact;
6103 /*----------------------------------------------------------------------------
6104 | Returns the result of adding the absolute values of the quadruple-precision
6105 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6106 | before being returned. `zSign' is ignored if the result is a NaN.
6107 | The addition is performed according to the IEC/IEEE Standard for Binary
6108 | Floating-Point Arithmetic.
6109 *----------------------------------------------------------------------------*/
6111 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6112 float_status *status)
6114 int32_t aExp, bExp, zExp;
6115 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6118 aSig1 = extractFloat128Frac1( a );
6119 aSig0 = extractFloat128Frac0( a );
6120 aExp = extractFloat128Exp( a );
6121 bSig1 = extractFloat128Frac1( b );
6122 bSig0 = extractFloat128Frac0( b );
6123 bExp = extractFloat128Exp( b );
6124 expDiff = aExp - bExp;
6125 if ( 0 < expDiff ) {
6126 if ( aExp == 0x7FFF ) {
6127 if (aSig0 | aSig1) {
6128 return propagateFloat128NaN(a, b, status);
6136 bSig0 |= LIT64( 0x0001000000000000 );
6138 shift128ExtraRightJamming(
6139 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6142 else if ( expDiff < 0 ) {
6143 if ( bExp == 0x7FFF ) {
6144 if (bSig0 | bSig1) {
6145 return propagateFloat128NaN(a, b, status);
6147 return packFloat128( zSign, 0x7FFF, 0, 0 );
6153 aSig0 |= LIT64( 0x0001000000000000 );
6155 shift128ExtraRightJamming(
6156 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6160 if ( aExp == 0x7FFF ) {
6161 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6162 return propagateFloat128NaN(a, b, status);
6166 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6168 if (status->flush_to_zero) {
6169 if (zSig0 | zSig1) {
6170 float_raise(float_flag_output_denormal, status);
6172 return packFloat128(zSign, 0, 0, 0);
6174 return packFloat128( zSign, 0, zSig0, zSig1 );
6177 zSig0 |= LIT64( 0x0002000000000000 );
6181 aSig0 |= LIT64( 0x0001000000000000 );
6182 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6184 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6187 shift128ExtraRightJamming(
6188 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6190 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6194 /*----------------------------------------------------------------------------
6195 | Returns the result of subtracting the absolute values of the quadruple-
6196 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6197 | difference is negated before being returned. `zSign' is ignored if the
6198 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6199 | Standard for Binary Floating-Point Arithmetic.
6200 *----------------------------------------------------------------------------*/
6202 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6203 float_status *status)
6205 int32_t aExp, bExp, zExp;
6206 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6209 aSig1 = extractFloat128Frac1( a );
6210 aSig0 = extractFloat128Frac0( a );
6211 aExp = extractFloat128Exp( a );
6212 bSig1 = extractFloat128Frac1( b );
6213 bSig0 = extractFloat128Frac0( b );
6214 bExp = extractFloat128Exp( b );
6215 expDiff = aExp - bExp;
6216 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6217 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6218 if ( 0 < expDiff ) goto aExpBigger;
6219 if ( expDiff < 0 ) goto bExpBigger;
6220 if ( aExp == 0x7FFF ) {
6221 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6222 return propagateFloat128NaN(a, b, status);
6224 float_raise(float_flag_invalid, status);
6225 return float128_default_nan(status);
6231 if ( bSig0 < aSig0 ) goto aBigger;
6232 if ( aSig0 < bSig0 ) goto bBigger;
6233 if ( bSig1 < aSig1 ) goto aBigger;
6234 if ( aSig1 < bSig1 ) goto bBigger;
6235 return packFloat128(status->float_rounding_mode == float_round_down,
6238 if ( bExp == 0x7FFF ) {
6239 if (bSig0 | bSig1) {
6240 return propagateFloat128NaN(a, b, status);
6242 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6248 aSig0 |= LIT64( 0x4000000000000000 );
6250 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6251 bSig0 |= LIT64( 0x4000000000000000 );
6253 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6256 goto normalizeRoundAndPack;
6258 if ( aExp == 0x7FFF ) {
6259 if (aSig0 | aSig1) {
6260 return propagateFloat128NaN(a, b, status);
6268 bSig0 |= LIT64( 0x4000000000000000 );
6270 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6271 aSig0 |= LIT64( 0x4000000000000000 );
6273 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6275 normalizeRoundAndPack:
6277 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6282 /*----------------------------------------------------------------------------
6283 | Returns the result of adding the quadruple-precision floating-point values
6284 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6285 | for Binary Floating-Point Arithmetic.
6286 *----------------------------------------------------------------------------*/
6288 float128 float128_add(float128 a, float128 b, float_status *status)
6292 aSign = extractFloat128Sign( a );
6293 bSign = extractFloat128Sign( b );
6294 if ( aSign == bSign ) {
6295 return addFloat128Sigs(a, b, aSign, status);
6298 return subFloat128Sigs(a, b, aSign, status);
6303 /*----------------------------------------------------------------------------
6304 | Returns the result of subtracting the quadruple-precision floating-point
6305 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6306 | Standard for Binary Floating-Point Arithmetic.
6307 *----------------------------------------------------------------------------*/
6309 float128 float128_sub(float128 a, float128 b, float_status *status)
6313 aSign = extractFloat128Sign( a );
6314 bSign = extractFloat128Sign( b );
6315 if ( aSign == bSign ) {
6316 return subFloat128Sigs(a, b, aSign, status);
6319 return addFloat128Sigs(a, b, aSign, status);
6324 /*----------------------------------------------------------------------------
6325 | Returns the result of multiplying the quadruple-precision floating-point
6326 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6327 | Standard for Binary Floating-Point Arithmetic.
6328 *----------------------------------------------------------------------------*/
6330 float128 float128_mul(float128 a, float128 b, float_status *status)
6332 flag aSign, bSign, zSign;
6333 int32_t aExp, bExp, zExp;
6334 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6336 aSig1 = extractFloat128Frac1( a );
6337 aSig0 = extractFloat128Frac0( a );
6338 aExp = extractFloat128Exp( a );
6339 aSign = extractFloat128Sign( a );
6340 bSig1 = extractFloat128Frac1( b );
6341 bSig0 = extractFloat128Frac0( b );
6342 bExp = extractFloat128Exp( b );
6343 bSign = extractFloat128Sign( b );
6344 zSign = aSign ^ bSign;
6345 if ( aExp == 0x7FFF ) {
6346 if ( ( aSig0 | aSig1 )
6347 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6348 return propagateFloat128NaN(a, b, status);
6350 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6351 return packFloat128( zSign, 0x7FFF, 0, 0 );
6353 if ( bExp == 0x7FFF ) {
6354 if (bSig0 | bSig1) {
6355 return propagateFloat128NaN(a, b, status);
6357 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6359 float_raise(float_flag_invalid, status);
6360 return float128_default_nan(status);
6362 return packFloat128( zSign, 0x7FFF, 0, 0 );
6365 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6366 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6369 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6370 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6372 zExp = aExp + bExp - 0x4000;
6373 aSig0 |= LIT64( 0x0001000000000000 );
6374 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6375 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6376 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6377 zSig2 |= ( zSig3 != 0 );
6378 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6379 shift128ExtraRightJamming(
6380 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6383 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6387 /*----------------------------------------------------------------------------
6388 | Returns the result of dividing the quadruple-precision floating-point value
6389 | `a' by the corresponding value `b'. The operation is performed according to
6390 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6391 *----------------------------------------------------------------------------*/
6393 float128 float128_div(float128 a, float128 b, float_status *status)
6395 flag aSign, bSign, zSign;
6396 int32_t aExp, bExp, zExp;
6397 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6398 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6400 aSig1 = extractFloat128Frac1( a );
6401 aSig0 = extractFloat128Frac0( a );
6402 aExp = extractFloat128Exp( a );
6403 aSign = extractFloat128Sign( a );
6404 bSig1 = extractFloat128Frac1( b );
6405 bSig0 = extractFloat128Frac0( b );
6406 bExp = extractFloat128Exp( b );
6407 bSign = extractFloat128Sign( b );
6408 zSign = aSign ^ bSign;
6409 if ( aExp == 0x7FFF ) {
6410 if (aSig0 | aSig1) {
6411 return propagateFloat128NaN(a, b, status);
6413 if ( bExp == 0x7FFF ) {
6414 if (bSig0 | bSig1) {
6415 return propagateFloat128NaN(a, b, status);
6419 return packFloat128( zSign, 0x7FFF, 0, 0 );
6421 if ( bExp == 0x7FFF ) {
6422 if (bSig0 | bSig1) {
6423 return propagateFloat128NaN(a, b, status);
6425 return packFloat128( zSign, 0, 0, 0 );
6428 if ( ( bSig0 | bSig1 ) == 0 ) {
6429 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6431 float_raise(float_flag_invalid, status);
6432 return float128_default_nan(status);
6434 float_raise(float_flag_divbyzero, status);
6435 return packFloat128( zSign, 0x7FFF, 0, 0 );
6437 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6440 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6441 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6443 zExp = aExp - bExp + 0x3FFD;
6445 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6447 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6448 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6449 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6452 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6453 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6454 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6455 while ( (int64_t) rem0 < 0 ) {
6457 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6459 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6460 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6461 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6462 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6463 while ( (int64_t) rem1 < 0 ) {
6465 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6467 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6469 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6470 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6474 /*----------------------------------------------------------------------------
6475 | Returns the remainder of the quadruple-precision floating-point value `a'
6476 | with respect to the corresponding value `b'. The operation is performed
6477 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6478 *----------------------------------------------------------------------------*/
6480 float128 float128_rem(float128 a, float128 b, float_status *status)
6483 int32_t aExp, bExp, expDiff;
6484 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6485 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6488 aSig1 = extractFloat128Frac1( a );
6489 aSig0 = extractFloat128Frac0( a );
6490 aExp = extractFloat128Exp( a );
6491 aSign = extractFloat128Sign( a );
6492 bSig1 = extractFloat128Frac1( b );
6493 bSig0 = extractFloat128Frac0( b );
6494 bExp = extractFloat128Exp( b );
6495 if ( aExp == 0x7FFF ) {
6496 if ( ( aSig0 | aSig1 )
6497 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6498 return propagateFloat128NaN(a, b, status);
6502 if ( bExp == 0x7FFF ) {
6503 if (bSig0 | bSig1) {
6504 return propagateFloat128NaN(a, b, status);
6509 if ( ( bSig0 | bSig1 ) == 0 ) {
6511 float_raise(float_flag_invalid, status);
6512 return float128_default_nan(status);
6514 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6517 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6518 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6520 expDiff = aExp - bExp;
6521 if ( expDiff < -1 ) return a;
6523 aSig0 | LIT64( 0x0001000000000000 ),
6525 15 - ( expDiff < 0 ),
6530 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6531 q = le128( bSig0, bSig1, aSig0, aSig1 );
6532 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6534 while ( 0 < expDiff ) {
6535 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6536 q = ( 4 < q ) ? q - 4 : 0;
6537 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6538 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6539 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6540 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6543 if ( -64 < expDiff ) {
6544 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6545 q = ( 4 < q ) ? q - 4 : 0;
6547 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6549 if ( expDiff < 0 ) {
6550 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6553 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6555 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6556 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6559 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6560 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6563 alternateASig0 = aSig0;
6564 alternateASig1 = aSig1;
6566 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6567 } while ( 0 <= (int64_t) aSig0 );
6569 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6570 if ( ( sigMean0 < 0 )
6571 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6572 aSig0 = alternateASig0;
6573 aSig1 = alternateASig1;
6575 zSign = ( (int64_t) aSig0 < 0 );
6576 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6577 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6581 /*----------------------------------------------------------------------------
6582 | Returns the square root of the quadruple-precision floating-point value `a'.
6583 | The operation is performed according to the IEC/IEEE Standard for Binary
6584 | Floating-Point Arithmetic.
6585 *----------------------------------------------------------------------------*/
6587 float128 float128_sqrt(float128 a, float_status *status)
6591 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6592 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6594 aSig1 = extractFloat128Frac1( a );
6595 aSig0 = extractFloat128Frac0( a );
6596 aExp = extractFloat128Exp( a );
6597 aSign = extractFloat128Sign( a );
6598 if ( aExp == 0x7FFF ) {
6599 if (aSig0 | aSig1) {
6600 return propagateFloat128NaN(a, a, status);
6602 if ( ! aSign ) return a;
6606 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6608 float_raise(float_flag_invalid, status);
6609 return float128_default_nan(status);
6612 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6613 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6615 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6616 aSig0 |= LIT64( 0x0001000000000000 );
6617 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6618 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6619 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6620 doubleZSig0 = zSig0<<1;
6621 mul64To128( zSig0, zSig0, &term0, &term1 );
6622 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6623 while ( (int64_t) rem0 < 0 ) {
6626 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6628 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6629 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6630 if ( zSig1 == 0 ) zSig1 = 1;
6631 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6632 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6633 mul64To128( zSig1, zSig1, &term2, &term3 );
6634 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6635 while ( (int64_t) rem1 < 0 ) {
6637 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6639 term2 |= doubleZSig0;
6640 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6642 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6644 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6645 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6649 /*----------------------------------------------------------------------------
6650 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6651 | the corresponding value `b', and 0 otherwise. The invalid exception is
6652 | raised if either operand is a NaN. Otherwise, the comparison is performed
6653 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6654 *----------------------------------------------------------------------------*/
6656 int float128_eq(float128 a, float128 b, float_status *status)
6659 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6660 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6661 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6662 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6664 float_raise(float_flag_invalid, status);
6669 && ( ( a.high == b.high )
6671 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6676 /*----------------------------------------------------------------------------
6677 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6678 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6679 | exception is raised if either operand is a NaN. The comparison is performed
6680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6681 *----------------------------------------------------------------------------*/
6683 int float128_le(float128 a, float128 b, float_status *status)
6687 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6688 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6689 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6690 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6692 float_raise(float_flag_invalid, status);
6695 aSign = extractFloat128Sign( a );
6696 bSign = extractFloat128Sign( b );
6697 if ( aSign != bSign ) {
6700 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6704 aSign ? le128( b.high, b.low, a.high, a.low )
6705 : le128( a.high, a.low, b.high, b.low );
6709 /*----------------------------------------------------------------------------
6710 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6711 | the corresponding value `b', and 0 otherwise. The invalid exception is
6712 | raised if either operand is a NaN. The comparison is performed according
6713 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6714 *----------------------------------------------------------------------------*/
6716 int float128_lt(float128 a, float128 b, float_status *status)
6720 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6721 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6722 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6723 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6725 float_raise(float_flag_invalid, status);
6728 aSign = extractFloat128Sign( a );
6729 bSign = extractFloat128Sign( b );
6730 if ( aSign != bSign ) {
6733 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6737 aSign ? lt128( b.high, b.low, a.high, a.low )
6738 : lt128( a.high, a.low, b.high, b.low );
6742 /*----------------------------------------------------------------------------
6743 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6744 | be compared, and 0 otherwise. The invalid exception is raised if either
6745 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6746 | Standard for Binary Floating-Point Arithmetic.
6747 *----------------------------------------------------------------------------*/
6749 int float128_unordered(float128 a, float128 b, float_status *status)
6751 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6752 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6753 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6754 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6756 float_raise(float_flag_invalid, status);
6762 /*----------------------------------------------------------------------------
6763 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6764 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6765 | exception. The comparison is performed according to the IEC/IEEE Standard
6766 | for Binary Floating-Point Arithmetic.
6767 *----------------------------------------------------------------------------*/
6769 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6772 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6773 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6774 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6775 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6777 if (float128_is_signaling_nan(a, status)
6778 || float128_is_signaling_nan(b, status)) {
6779 float_raise(float_flag_invalid, status);
6785 && ( ( a.high == b.high )
6787 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6792 /*----------------------------------------------------------------------------
6793 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6794 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6795 | cause an exception. Otherwise, the comparison is performed according to the
6796 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6797 *----------------------------------------------------------------------------*/
6799 int float128_le_quiet(float128 a, float128 b, float_status *status)
6803 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6804 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6805 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6806 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6808 if (float128_is_signaling_nan(a, status)
6809 || float128_is_signaling_nan(b, status)) {
6810 float_raise(float_flag_invalid, status);
6814 aSign = extractFloat128Sign( a );
6815 bSign = extractFloat128Sign( b );
6816 if ( aSign != bSign ) {
6819 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6823 aSign ? le128( b.high, b.low, a.high, a.low )
6824 : le128( a.high, a.low, b.high, b.low );
6828 /*----------------------------------------------------------------------------
6829 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6830 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6831 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6832 | Standard for Binary Floating-Point Arithmetic.
6833 *----------------------------------------------------------------------------*/
6835 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6839 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6840 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6841 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6842 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6844 if (float128_is_signaling_nan(a, status)
6845 || float128_is_signaling_nan(b, status)) {
6846 float_raise(float_flag_invalid, status);
6850 aSign = extractFloat128Sign( a );
6851 bSign = extractFloat128Sign( b );
6852 if ( aSign != bSign ) {
6855 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6859 aSign ? lt128( b.high, b.low, a.high, a.low )
6860 : lt128( a.high, a.low, b.high, b.low );
6864 /*----------------------------------------------------------------------------
6865 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6866 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6867 | comparison is performed according to the IEC/IEEE Standard for Binary
6868 | Floating-Point Arithmetic.
6869 *----------------------------------------------------------------------------*/
6871 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6873 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6874 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6875 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6876 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6878 if (float128_is_signaling_nan(a, status)
6879 || float128_is_signaling_nan(b, status)) {
6880 float_raise(float_flag_invalid, status);
6887 #define COMPARE(s, nan_exp) \
6888 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
6889 int is_quiet, float_status *status) \
6891 flag aSign, bSign; \
6892 uint ## s ## _t av, bv; \
6893 a = float ## s ## _squash_input_denormal(a, status); \
6894 b = float ## s ## _squash_input_denormal(b, status); \
6896 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6897 extractFloat ## s ## Frac( a ) ) || \
6898 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6899 extractFloat ## s ## Frac( b ) )) { \
6901 float ## s ## _is_signaling_nan(a, status) || \
6902 float ## s ## _is_signaling_nan(b, status)) { \
6903 float_raise(float_flag_invalid, status); \
6905 return float_relation_unordered; \
6907 aSign = extractFloat ## s ## Sign( a ); \
6908 bSign = extractFloat ## s ## Sign( b ); \
6909 av = float ## s ## _val(a); \
6910 bv = float ## s ## _val(b); \
6911 if ( aSign != bSign ) { \
6912 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6914 return float_relation_equal; \
6916 return 1 - (2 * aSign); \
6920 return float_relation_equal; \
6922 return 1 - 2 * (aSign ^ ( av < bv )); \
6927 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
6929 return float ## s ## _compare_internal(a, b, 0, status); \
6932 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
6933 float_status *status) \
6935 return float ## s ## _compare_internal(a, b, 1, status); \
6941 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6942 int is_quiet, float_status *status)
6946 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6947 float_raise(float_flag_invalid, status);
6948 return float_relation_unordered;
6950 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6951 ( extractFloatx80Frac( a )<<1 ) ) ||
6952 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6953 ( extractFloatx80Frac( b )<<1 ) )) {
6955 floatx80_is_signaling_nan(a, status) ||
6956 floatx80_is_signaling_nan(b, status)) {
6957 float_raise(float_flag_invalid, status);
6959 return float_relation_unordered;
6961 aSign = extractFloatx80Sign( a );
6962 bSign = extractFloatx80Sign( b );
6963 if ( aSign != bSign ) {
6965 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6966 ( ( a.low | b.low ) == 0 ) ) {
6968 return float_relation_equal;
6970 return 1 - (2 * aSign);
6973 if (a.low == b.low && a.high == b.high) {
6974 return float_relation_equal;
6976 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6981 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6983 return floatx80_compare_internal(a, b, 0, status);
6986 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6988 return floatx80_compare_internal(a, b, 1, status);
6991 static inline int float128_compare_internal(float128 a, float128 b,
6992 int is_quiet, float_status *status)
6996 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6997 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6998 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6999 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7001 float128_is_signaling_nan(a, status) ||
7002 float128_is_signaling_nan(b, status)) {
7003 float_raise(float_flag_invalid, status);
7005 return float_relation_unordered;
7007 aSign = extractFloat128Sign( a );
7008 bSign = extractFloat128Sign( b );
7009 if ( aSign != bSign ) {
7010 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7012 return float_relation_equal;
7014 return 1 - (2 * aSign);
7017 if (a.low == b.low && a.high == b.high) {
7018 return float_relation_equal;
7020 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7025 int float128_compare(float128 a, float128 b, float_status *status)
7027 return float128_compare_internal(a, b, 0, status);
7030 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7032 return float128_compare_internal(a, b, 1, status);
7035 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7041 if (floatx80_invalid_encoding(a)) {
7042 float_raise(float_flag_invalid, status);
7043 return floatx80_default_nan(status);
7045 aSig = extractFloatx80Frac( a );
7046 aExp = extractFloatx80Exp( a );
7047 aSign = extractFloatx80Sign( a );
7049 if ( aExp == 0x7FFF ) {
7051 return propagateFloatx80NaN(a, a, status);
7065 } else if (n < -0x10000) {
7070 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7071 aSign, aExp, aSig, 0, status);
7074 float128 float128_scalbn(float128 a, int n, float_status *status)
7078 uint64_t aSig0, aSig1;
7080 aSig1 = extractFloat128Frac1( a );
7081 aSig0 = extractFloat128Frac0( a );
7082 aExp = extractFloat128Exp( a );
7083 aSign = extractFloat128Sign( a );
7084 if ( aExp == 0x7FFF ) {
7085 if ( aSig0 | aSig1 ) {
7086 return propagateFloat128NaN(a, a, status);
7091 aSig0 |= LIT64( 0x0001000000000000 );
7092 } else if (aSig0 == 0 && aSig1 == 0) {
7100 } else if (n < -0x10000) {
7105 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1