4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a)
104 return float16_val(a) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a)
113 return (float16_val(a) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the sign bit of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline flag extractFloat16Sign(float16 a)
122 return float16_val(a)>>15;
125 /*----------------------------------------------------------------------------
126 | Returns the fraction bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline uint32_t extractFloat32Frac(float32 a)
131 return float32_val(a) & 0x007FFFFF;
134 /*----------------------------------------------------------------------------
135 | Returns the exponent bits of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline int extractFloat32Exp(float32 a)
140 return (float32_val(a) >> 23) & 0xFF;
143 /*----------------------------------------------------------------------------
144 | Returns the sign bit of the single-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline flag extractFloat32Sign(float32 a)
149 return float32_val(a) >> 31;
152 /*----------------------------------------------------------------------------
153 | Returns the fraction bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline uint64_t extractFloat64Frac(float64 a)
158 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
161 /*----------------------------------------------------------------------------
162 | Returns the exponent bits of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline int extractFloat64Exp(float64 a)
167 return (float64_val(a) >> 52) & 0x7FF;
170 /*----------------------------------------------------------------------------
171 | Returns the sign bit of the double-precision floating-point value `a'.
172 *----------------------------------------------------------------------------*/
174 static inline flag extractFloat64Sign(float64 a)
176 return float64_val(a) >> 63;
180 * Classify a floating point number. Everything above float_class_qnan
181 * is a NaN so cls >= float_class_qnan is any NaN.
184 typedef enum __attribute__ ((__packed__)) {
185 float_class_unclassified,
189 float_class_qnan, /* all NaNs from here */
191 float_class_msnan, /* maybe silenced */
195 * Structure holding all of the decomposed parts of a float. The
196 * exponent is unbiased and the fraction is normalized. All
197 * calculations are done with a 64 bit fraction and then rounded as
198 * appropriate for the final format.
200 * Thanks to the packed FloatClass a decent compiler should be able to
201 * fit the whole structure into registers and avoid using the stack
202 * for parameter passing.
212 #define DECOMPOSED_BINARY_POINT (64 - 2)
213 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
214 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
216 /* Structure holding all of the relevant parameters for a format.
217 * exp_size: the size of the exponent field
218 * exp_bias: the offset applied to the exponent field
219 * exp_max: the maximum normalised exponent
220 * frac_size: the size of the fraction field
221 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
222 * The following are computed based the size of fraction
223 * frac_lsb: least significant bit of fraction
224 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
225 * round_mask/roundeven_mask: masks used for rounding
236 uint64_t roundeven_mask;
239 /* Expand fields based on the size of exponent and fraction */
240 #define FLOAT_PARAMS(E, F) \
242 .exp_bias = ((1 << E) - 1) >> 1, \
243 .exp_max = (1 << E) - 1, \
245 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
246 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
247 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
248 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
249 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
251 static const FloatFmt float16_params = {
255 static const FloatFmt float32_params = {
259 static const FloatFmt float64_params = {
263 /* Unpack a float to parts, but do not canonicalize. */
264 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
266 const int sign_pos = fmt.frac_size + fmt.exp_size;
268 return (FloatParts) {
269 .cls = float_class_unclassified,
270 .sign = extract64(raw, sign_pos, 1),
271 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
272 .frac = extract64(raw, 0, fmt.frac_size),
276 static inline FloatParts float16_unpack_raw(float16 f)
278 return unpack_raw(float16_params, f);
281 static inline FloatParts float32_unpack_raw(float32 f)
283 return unpack_raw(float32_params, f);
286 static inline FloatParts float64_unpack_raw(float64 f)
288 return unpack_raw(float64_params, f);
291 /* Pack a float from parts, but do not canonicalize. */
292 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
294 const int sign_pos = fmt.frac_size + fmt.exp_size;
295 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
296 return deposit64(ret, sign_pos, 1, p.sign);
299 static inline float16 float16_pack_raw(FloatParts p)
301 return make_float16(pack_raw(float16_params, p));
304 static inline float32 float32_pack_raw(FloatParts p)
306 return make_float32(pack_raw(float32_params, p));
309 static inline float64 float64_pack_raw(FloatParts p)
311 return make_float64(pack_raw(float64_params, p));
314 /*----------------------------------------------------------------------------
315 | Functions and definitions to determine: (1) whether tininess for underflow
316 | is detected before or after rounding by default, (2) what (if anything)
317 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
318 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
319 | are propagated from function inputs to output. These details are target-
321 *----------------------------------------------------------------------------*/
322 #include "softfloat-specialize.h"
324 /* Canonicalize EXP and FRAC, setting CLS. */
325 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
326 float_status *status)
328 if (part.exp == parm->exp_max) {
329 if (part.frac == 0) {
330 part.cls = float_class_inf;
332 part.frac <<= parm->frac_shift;
333 part.cls = (parts_is_snan_frac(part.frac, status)
334 ? float_class_snan : float_class_qnan);
336 } else if (part.exp == 0) {
337 if (likely(part.frac == 0)) {
338 part.cls = float_class_zero;
339 } else if (status->flush_inputs_to_zero) {
340 float_raise(float_flag_input_denormal, status);
341 part.cls = float_class_zero;
344 int shift = clz64(part.frac) - 1;
345 part.cls = float_class_normal;
346 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
350 part.cls = float_class_normal;
351 part.exp -= parm->exp_bias;
352 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
357 /* Round and uncanonicalize a floating-point number by parts. There
358 * are FRAC_SHIFT bits that may require rounding at the bottom of the
359 * fraction; these bits will be removed. The exponent will be biased
360 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
363 static FloatParts round_canonical(FloatParts p, float_status *s,
364 const FloatFmt *parm)
366 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
367 const uint64_t round_mask = parm->round_mask;
368 const uint64_t roundeven_mask = parm->roundeven_mask;
369 const int exp_max = parm->exp_max;
370 const int frac_shift = parm->frac_shift;
379 case float_class_normal:
380 switch (s->float_rounding_mode) {
381 case float_round_nearest_even:
382 overflow_norm = false;
383 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
385 case float_round_ties_away:
386 overflow_norm = false;
389 case float_round_to_zero:
390 overflow_norm = true;
394 inc = p.sign ? 0 : round_mask;
395 overflow_norm = p.sign;
397 case float_round_down:
398 inc = p.sign ? round_mask : 0;
399 overflow_norm = !p.sign;
402 g_assert_not_reached();
405 exp += parm->exp_bias;
406 if (likely(exp > 0)) {
407 if (frac & round_mask) {
408 flags |= float_flag_inexact;
410 if (frac & DECOMPOSED_OVERFLOW_BIT) {
417 if (unlikely(exp >= exp_max)) {
418 flags |= float_flag_overflow | float_flag_inexact;
423 p.cls = float_class_inf;
427 } else if (s->flush_to_zero) {
428 flags |= float_flag_output_denormal;
429 p.cls = float_class_zero;
432 bool is_tiny = (s->float_detect_tininess
433 == float_tininess_before_rounding)
435 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
437 shift64RightJamming(frac, 1 - exp, &frac);
438 if (frac & round_mask) {
439 /* Need to recompute round-to-even. */
440 if (s->float_rounding_mode == float_round_nearest_even) {
441 inc = ((frac & roundeven_mask) != frac_lsbm1
444 flags |= float_flag_inexact;
448 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
451 if (is_tiny && (flags & float_flag_inexact)) {
452 flags |= float_flag_underflow;
454 if (exp == 0 && frac == 0) {
455 p.cls = float_class_zero;
460 case float_class_zero:
466 case float_class_inf:
472 case float_class_qnan:
473 case float_class_snan:
475 frac >>= parm->frac_shift;
479 g_assert_not_reached();
482 float_raise(flags, s);
488 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
490 return canonicalize(float16_unpack_raw(f), &float16_params, s);
493 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
496 case float_class_msnan:
497 p.frac >>= float16_params.frac_shift;
498 return float16_maybe_silence_nan(float16_pack_raw(p), s);
500 p = round_canonical(p, s, &float16_params);
501 return float16_pack_raw(p);
505 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
507 return canonicalize(float32_unpack_raw(f), &float32_params, s);
510 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
513 case float_class_msnan:
514 p.frac >>= float32_params.frac_shift;
515 return float32_maybe_silence_nan(float32_pack_raw(p), s);
517 p = round_canonical(p, s, &float32_params);
518 return float32_pack_raw(p);
522 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
524 return canonicalize(float64_unpack_raw(f), &float64_params, s);
527 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
530 case float_class_msnan:
531 p.frac >>= float64_params.frac_shift;
532 return float64_maybe_silence_nan(float64_pack_raw(p), s);
534 p = round_canonical(p, s, &float64_params);
535 return float64_pack_raw(p);
539 /* Simple helpers for checking if what NaN we have */
540 static bool is_nan(FloatClass c)
542 return unlikely(c >= float_class_qnan);
544 static bool is_snan(FloatClass c)
546 return c == float_class_snan;
548 static bool is_qnan(FloatClass c)
550 return c == float_class_qnan;
553 static FloatParts return_nan(FloatParts a, float_status *s)
556 case float_class_snan:
557 s->float_exception_flags |= float_flag_invalid;
558 a.cls = float_class_msnan;
560 case float_class_qnan:
561 if (s->default_nan_mode) {
562 return parts_default_nan(s);
567 g_assert_not_reached();
572 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
574 if (is_snan(a.cls) || is_snan(b.cls)) {
575 s->float_exception_flags |= float_flag_invalid;
578 if (s->default_nan_mode) {
579 return parts_default_nan(s);
581 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
582 is_qnan(b.cls), is_snan(b.cls),
584 (a.frac == b.frac && a.sign < b.sign))) {
587 a.cls = float_class_msnan;
592 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
593 bool inf_zero, float_status *s)
597 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
598 s->float_exception_flags |= float_flag_invalid;
601 which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
602 is_qnan(b.cls), is_snan(b.cls),
603 is_qnan(c.cls), is_snan(c.cls),
606 if (s->default_nan_mode) {
607 /* Note that this check is after pickNaNMulAdd so that function
608 * has an opportunity to set the Invalid flag.
623 return parts_default_nan(s);
625 g_assert_not_reached();
627 a.cls = float_class_msnan;
633 * Returns the result of adding or subtracting the values of the
634 * floating-point values `a' and `b'. The operation is performed
635 * according to the IEC/IEEE Standard for Binary Floating-Point
639 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642 bool a_sign = a.sign;
643 bool b_sign = b.sign ^ subtract;
645 if (a_sign != b_sign) {
648 if (a.cls == float_class_normal && b.cls == float_class_normal) {
649 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
650 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
651 a.frac = a.frac - b.frac;
653 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
654 a.frac = b.frac - a.frac;
660 a.cls = float_class_zero;
661 a.sign = s->float_rounding_mode == float_round_down;
663 int shift = clz64(a.frac) - 1;
664 a.frac = a.frac << shift;
665 a.exp = a.exp - shift;
670 if (is_nan(a.cls) || is_nan(b.cls)) {
671 return pick_nan(a, b, s);
673 if (a.cls == float_class_inf) {
674 if (b.cls == float_class_inf) {
675 float_raise(float_flag_invalid, s);
676 return parts_default_nan(s);
680 if (a.cls == float_class_zero && b.cls == float_class_zero) {
681 a.sign = s->float_rounding_mode == float_round_down;
684 if (a.cls == float_class_zero || b.cls == float_class_inf) {
688 if (b.cls == float_class_zero) {
693 if (a.cls == float_class_normal && b.cls == float_class_normal) {
695 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
696 } else if (a.exp < b.exp) {
697 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
701 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
707 if (is_nan(a.cls) || is_nan(b.cls)) {
708 return pick_nan(a, b, s);
710 if (a.cls == float_class_inf || b.cls == float_class_zero) {
713 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 g_assert_not_reached();
722 * Returns the result of adding or subtracting the floating-point
723 * values `a' and `b'. The operation is performed according to the
724 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
727 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
728 float_status *status)
730 FloatParts pa = float16_unpack_canonical(a, status);
731 FloatParts pb = float16_unpack_canonical(b, status);
732 FloatParts pr = addsub_floats(pa, pb, false, status);
734 return float16_round_pack_canonical(pr, status);
737 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
738 float_status *status)
740 FloatParts pa = float32_unpack_canonical(a, status);
741 FloatParts pb = float32_unpack_canonical(b, status);
742 FloatParts pr = addsub_floats(pa, pb, false, status);
744 return float32_round_pack_canonical(pr, status);
747 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
748 float_status *status)
750 FloatParts pa = float64_unpack_canonical(a, status);
751 FloatParts pb = float64_unpack_canonical(b, status);
752 FloatParts pr = addsub_floats(pa, pb, false, status);
754 return float64_round_pack_canonical(pr, status);
757 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
758 float_status *status)
760 FloatParts pa = float16_unpack_canonical(a, status);
761 FloatParts pb = float16_unpack_canonical(b, status);
762 FloatParts pr = addsub_floats(pa, pb, true, status);
764 return float16_round_pack_canonical(pr, status);
767 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
768 float_status *status)
770 FloatParts pa = float32_unpack_canonical(a, status);
771 FloatParts pb = float32_unpack_canonical(b, status);
772 FloatParts pr = addsub_floats(pa, pb, true, status);
774 return float32_round_pack_canonical(pr, status);
777 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
778 float_status *status)
780 FloatParts pa = float64_unpack_canonical(a, status);
781 FloatParts pb = float64_unpack_canonical(b, status);
782 FloatParts pr = addsub_floats(pa, pb, true, status);
784 return float64_round_pack_canonical(pr, status);
788 * Returns the result of multiplying the floating-point values `a' and
789 * `b'. The operation is performed according to the IEC/IEEE Standard
790 * for Binary Floating-Point Arithmetic.
793 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
795 bool sign = a.sign ^ b.sign;
797 if (a.cls == float_class_normal && b.cls == float_class_normal) {
799 int exp = a.exp + b.exp;
801 mul64To128(a.frac, b.frac, &hi, &lo);
802 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
803 if (lo & DECOMPOSED_OVERFLOW_BIT) {
804 shift64RightJamming(lo, 1, &lo);
814 /* handle all the NaN cases */
815 if (is_nan(a.cls) || is_nan(b.cls)) {
816 return pick_nan(a, b, s);
818 /* Inf * Zero == NaN */
819 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
820 (a.cls == float_class_zero && b.cls == float_class_inf)) {
821 s->float_exception_flags |= float_flag_invalid;
822 return parts_default_nan(s);
824 /* Multiply by 0 or Inf */
825 if (a.cls == float_class_inf || a.cls == float_class_zero) {
829 if (b.cls == float_class_inf || b.cls == float_class_zero) {
833 g_assert_not_reached();
836 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
837 float_status *status)
839 FloatParts pa = float16_unpack_canonical(a, status);
840 FloatParts pb = float16_unpack_canonical(b, status);
841 FloatParts pr = mul_floats(pa, pb, status);
843 return float16_round_pack_canonical(pr, status);
846 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
847 float_status *status)
849 FloatParts pa = float32_unpack_canonical(a, status);
850 FloatParts pb = float32_unpack_canonical(b, status);
851 FloatParts pr = mul_floats(pa, pb, status);
853 return float32_round_pack_canonical(pr, status);
856 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
857 float_status *status)
859 FloatParts pa = float64_unpack_canonical(a, status);
860 FloatParts pb = float64_unpack_canonical(b, status);
861 FloatParts pr = mul_floats(pa, pb, status);
863 return float64_round_pack_canonical(pr, status);
867 * Returns the result of multiplying the floating-point values `a' and
868 * `b' then adding 'c', with no intermediate rounding step after the
869 * multiplication. The operation is performed according to the
870 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
871 * The flags argument allows the caller to select negation of the
872 * addend, the intermediate product, or the final result. (The
873 * difference between this and having the caller do a separate
874 * negation is that negating externally will flip the sign bit on
878 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
879 int flags, float_status *s)
881 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
882 ((1 << float_class_inf) | (1 << float_class_zero));
884 bool sign_flip = flags & float_muladd_negate_result;
889 /* It is implementation-defined whether the cases of (0,inf,qnan)
890 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
891 * they return if they do), so we have to hand this information
892 * off to the target-specific pick-a-NaN routine.
894 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
895 return pick_nan_muladd(a, b, c, inf_zero, s);
899 s->float_exception_flags |= float_flag_invalid;
900 return parts_default_nan(s);
903 if (flags & float_muladd_negate_c) {
907 p_sign = a.sign ^ b.sign;
909 if (flags & float_muladd_negate_product) {
913 if (a.cls == float_class_inf || b.cls == float_class_inf) {
914 p_class = float_class_inf;
915 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
916 p_class = float_class_zero;
918 p_class = float_class_normal;
921 if (c.cls == float_class_inf) {
922 if (p_class == float_class_inf && p_sign != c.sign) {
923 s->float_exception_flags |= float_flag_invalid;
924 return parts_default_nan(s);
926 a.cls = float_class_inf;
927 a.sign = c.sign ^ sign_flip;
932 if (p_class == float_class_inf) {
933 a.cls = float_class_inf;
934 a.sign = p_sign ^ sign_flip;
938 if (p_class == float_class_zero) {
939 if (c.cls == float_class_zero) {
940 if (p_sign != c.sign) {
941 p_sign = s->float_rounding_mode == float_round_down;
944 } else if (flags & float_muladd_halve_result) {
951 /* a & b should be normals now... */
952 assert(a.cls == float_class_normal &&
953 b.cls == float_class_normal);
955 p_exp = a.exp + b.exp;
957 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
960 mul64To128(a.frac, b.frac, &hi, &lo);
961 /* binary point now at bit 124 */
963 /* check for overflow */
964 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
965 shift128RightJamming(hi, lo, 1, &hi, &lo);
970 if (c.cls == float_class_zero) {
971 /* move binary point back to 62 */
972 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
974 int exp_diff = p_exp - c.exp;
975 if (p_sign == c.sign) {
978 shift128RightJamming(hi, lo,
979 DECOMPOSED_BINARY_POINT - exp_diff,
985 /* shift c to the same binary point as the product (124) */
988 shift128RightJamming(c_hi, c_lo,
991 add128(hi, lo, c_hi, c_lo, &hi, &lo);
992 /* move binary point back to 62 */
993 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
996 if (lo & DECOMPOSED_OVERFLOW_BIT) {
997 shift64RightJamming(lo, 1, &lo);
1003 uint64_t c_hi, c_lo;
1004 /* make C binary point match product at bit 124 */
1008 if (exp_diff <= 0) {
1009 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1013 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1015 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1020 shift128RightJamming(c_hi, c_lo,
1023 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1026 if (hi == 0 && lo == 0) {
1027 a.cls = float_class_zero;
1028 a.sign = s->float_rounding_mode == float_round_down;
1029 a.sign ^= sign_flip;
1036 shift = clz64(lo) + 64;
1038 /* Normalizing to a binary point of 124 is the
1039 correct adjust for the exponent. However since we're
1040 shifting, we might as well put the binary point back
1041 at 62 where we really want it. Therefore shift as
1042 if we're leaving 1 bit at the top of the word, but
1043 adjust the exponent as if we're leaving 3 bits. */
1046 lo = lo << (shift - 64);
1048 hi = (hi << shift) | (lo >> (64 - shift));
1049 lo = hi | ((lo << shift) != 0);
1056 if (flags & float_muladd_halve_result) {
1060 /* finally prepare our result */
1061 a.cls = float_class_normal;
1062 a.sign = p_sign ^ sign_flip;
1069 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1070 int flags, float_status *status)
1072 FloatParts pa = float16_unpack_canonical(a, status);
1073 FloatParts pb = float16_unpack_canonical(b, status);
1074 FloatParts pc = float16_unpack_canonical(c, status);
1075 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1077 return float16_round_pack_canonical(pr, status);
1080 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1081 int flags, float_status *status)
1083 FloatParts pa = float32_unpack_canonical(a, status);
1084 FloatParts pb = float32_unpack_canonical(b, status);
1085 FloatParts pc = float32_unpack_canonical(c, status);
1086 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1088 return float32_round_pack_canonical(pr, status);
1091 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1092 int flags, float_status *status)
1094 FloatParts pa = float64_unpack_canonical(a, status);
1095 FloatParts pb = float64_unpack_canonical(b, status);
1096 FloatParts pc = float64_unpack_canonical(c, status);
1097 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1099 return float64_round_pack_canonical(pr, status);
1103 * Returns the result of dividing the floating-point value `a' by the
1104 * corresponding value `b'. The operation is performed according to
1105 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1108 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1110 bool sign = a.sign ^ b.sign;
1112 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1113 uint64_t temp_lo, temp_hi;
1114 int exp = a.exp - b.exp;
1115 if (a.frac < b.frac) {
1117 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1118 &temp_hi, &temp_lo);
1120 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1121 &temp_hi, &temp_lo);
1123 /* LSB of quot is set if inexact which roundandpack will use
1124 * to set flags. Yet again we re-use a for the result */
1125 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1130 /* handle all the NaN cases */
1131 if (is_nan(a.cls) || is_nan(b.cls)) {
1132 return pick_nan(a, b, s);
1134 /* 0/0 or Inf/Inf */
1137 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1138 s->float_exception_flags |= float_flag_invalid;
1139 return parts_default_nan(s);
1141 /* Inf / x or 0 / x */
1142 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1147 if (b.cls == float_class_zero) {
1148 s->float_exception_flags |= float_flag_divbyzero;
1149 a.cls = float_class_inf;
1154 if (b.cls == float_class_inf) {
1155 a.cls = float_class_zero;
1159 g_assert_not_reached();
1162 float16 float16_div(float16 a, float16 b, float_status *status)
1164 FloatParts pa = float16_unpack_canonical(a, status);
1165 FloatParts pb = float16_unpack_canonical(b, status);
1166 FloatParts pr = div_floats(pa, pb, status);
1168 return float16_round_pack_canonical(pr, status);
1171 float32 float32_div(float32 a, float32 b, float_status *status)
1173 FloatParts pa = float32_unpack_canonical(a, status);
1174 FloatParts pb = float32_unpack_canonical(b, status);
1175 FloatParts pr = div_floats(pa, pb, status);
1177 return float32_round_pack_canonical(pr, status);
1180 float64 float64_div(float64 a, float64 b, float_status *status)
1182 FloatParts pa = float64_unpack_canonical(a, status);
1183 FloatParts pb = float64_unpack_canonical(b, status);
1184 FloatParts pr = div_floats(pa, pb, status);
1186 return float64_round_pack_canonical(pr, status);
1190 * Rounds the floating-point value `a' to an integer, and returns the
1191 * result as a floating-point value. The operation is performed
1192 * according to the IEC/IEEE Standard for Binary Floating-Point
1196 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1198 if (is_nan(a.cls)) {
1199 return return_nan(a, s);
1203 case float_class_zero:
1204 case float_class_inf:
1205 case float_class_qnan:
1206 /* already "integral" */
1208 case float_class_normal:
1209 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1210 /* already integral */
1215 /* all fractional */
1216 s->float_exception_flags |= float_flag_inexact;
1217 switch (rounding_mode) {
1218 case float_round_nearest_even:
1219 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1221 case float_round_ties_away:
1222 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1224 case float_round_to_zero:
1227 case float_round_up:
1230 case float_round_down:
1234 g_assert_not_reached();
1238 a.frac = DECOMPOSED_IMPLICIT_BIT;
1241 a.cls = float_class_zero;
1244 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1245 uint64_t frac_lsbm1 = frac_lsb >> 1;
1246 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1247 uint64_t rnd_mask = rnd_even_mask >> 1;
1250 switch (rounding_mode) {
1251 case float_round_nearest_even:
1252 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1254 case float_round_ties_away:
1257 case float_round_to_zero:
1260 case float_round_up:
1261 inc = a.sign ? 0 : rnd_mask;
1263 case float_round_down:
1264 inc = a.sign ? rnd_mask : 0;
1267 g_assert_not_reached();
1270 if (a.frac & rnd_mask) {
1271 s->float_exception_flags |= float_flag_inexact;
1273 a.frac &= ~rnd_mask;
1274 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1282 g_assert_not_reached();
1287 float16 float16_round_to_int(float16 a, float_status *s)
1289 FloatParts pa = float16_unpack_canonical(a, s);
1290 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1291 return float16_round_pack_canonical(pr, s);
1294 float32 float32_round_to_int(float32 a, float_status *s)
1296 FloatParts pa = float32_unpack_canonical(a, s);
1297 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1298 return float32_round_pack_canonical(pr, s);
1301 float64 float64_round_to_int(float64 a, float_status *s)
1303 FloatParts pa = float64_unpack_canonical(a, s);
1304 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1305 return float64_round_pack_canonical(pr, s);
1308 float64 float64_trunc_to_int(float64 a, float_status *s)
1310 FloatParts pa = float64_unpack_canonical(a, s);
1311 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1312 return float64_round_pack_canonical(pr, s);
1316 * Returns the result of converting the floating-point value `a' to
1317 * the two's complement integer format. The conversion is performed
1318 * according to the IEC/IEEE Standard for Binary Floating-Point
1319 * Arithmetic---which means in particular that the conversion is
1320 * rounded according to the current rounding mode. If `a' is a NaN,
1321 * the largest positive integer is returned. Otherwise, if the
1322 * conversion overflows, the largest integer with the same sign as `a'
1326 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1327 int64_t min, int64_t max,
1331 int orig_flags = get_float_exception_flags(s);
1332 FloatParts p = round_to_int(in, rmode, s);
1335 case float_class_snan:
1336 case float_class_qnan:
1337 case float_class_msnan:
1338 s->float_exception_flags = orig_flags | float_flag_invalid;
1340 case float_class_inf:
1341 s->float_exception_flags = orig_flags | float_flag_invalid;
1342 return p.sign ? min : max;
1343 case float_class_zero:
1345 case float_class_normal:
1346 if (p.exp < DECOMPOSED_BINARY_POINT) {
1347 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1348 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1349 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1354 if (r <= -(uint64_t) min) {
1357 s->float_exception_flags = orig_flags | float_flag_invalid;
1364 s->float_exception_flags = orig_flags | float_flag_invalid;
1369 g_assert_not_reached();
1373 #define FLOAT_TO_INT(fsz, isz) \
1374 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1377 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1378 return round_to_int_and_pack(p, s->float_rounding_mode, \
1379 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1383 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1384 (float ## fsz a, float_status *s) \
1386 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1387 return round_to_int_and_pack(p, float_round_to_zero, \
1388 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1392 FLOAT_TO_INT(16, 16)
1393 FLOAT_TO_INT(16, 32)
1394 FLOAT_TO_INT(16, 64)
1396 FLOAT_TO_INT(32, 16)
1397 FLOAT_TO_INT(32, 32)
1398 FLOAT_TO_INT(32, 64)
1400 FLOAT_TO_INT(64, 16)
1401 FLOAT_TO_INT(64, 32)
1402 FLOAT_TO_INT(64, 64)
1407 * Returns the result of converting the floating-point value `a' to
1408 * the unsigned integer format. The conversion is performed according
1409 * to the IEC/IEEE Standard for Binary Floating-Point
1410 * Arithmetic---which means in particular that the conversion is
1411 * rounded according to the current rounding mode. If `a' is a NaN,
1412 * the largest unsigned integer is returned. Otherwise, if the
1413 * conversion overflows, the largest unsigned integer is returned. If
1414 * the 'a' is negative, the result is rounded and zero is returned;
1415 * values that do not round to zero will raise the inexact exception
1419 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1422 int orig_flags = get_float_exception_flags(s);
1423 FloatParts p = round_to_int(in, rmode, s);
1426 case float_class_snan:
1427 case float_class_qnan:
1428 case float_class_msnan:
1429 s->float_exception_flags = orig_flags | float_flag_invalid;
1431 case float_class_inf:
1432 s->float_exception_flags = orig_flags | float_flag_invalid;
1433 return p.sign ? 0 : max;
1434 case float_class_zero:
1436 case float_class_normal:
1440 s->float_exception_flags = orig_flags | float_flag_invalid;
1444 if (p.exp < DECOMPOSED_BINARY_POINT) {
1445 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1446 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1447 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1449 s->float_exception_flags = orig_flags | float_flag_invalid;
1453 /* For uint64 this will never trip, but if p.exp is too large
1454 * to shift a decomposed fraction we shall have exited via the
1458 s->float_exception_flags = orig_flags | float_flag_invalid;
1465 g_assert_not_reached();
1469 #define FLOAT_TO_UINT(fsz, isz) \
1470 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1473 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1474 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1475 UINT ## isz ## _MAX, s); \
1478 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1479 (float ## fsz a, float_status *s) \
1481 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1482 return round_to_uint_and_pack(p, float_round_to_zero, \
1483 UINT ## isz ## _MAX, s); \
1486 FLOAT_TO_UINT(16, 16)
1487 FLOAT_TO_UINT(16, 32)
1488 FLOAT_TO_UINT(16, 64)
1490 FLOAT_TO_UINT(32, 16)
1491 FLOAT_TO_UINT(32, 32)
1492 FLOAT_TO_UINT(32, 64)
1494 FLOAT_TO_UINT(64, 16)
1495 FLOAT_TO_UINT(64, 32)
1496 FLOAT_TO_UINT(64, 64)
1498 #undef FLOAT_TO_UINT
1501 * Integer to float conversions
1503 * Returns the result of converting the two's complement integer `a'
1504 * to the floating-point format. The conversion is performed according
1505 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1508 static FloatParts int_to_float(int64_t a, float_status *status)
1512 r.cls = float_class_zero;
1514 } else if (a == (1ULL << 63)) {
1515 r.cls = float_class_normal;
1517 r.frac = DECOMPOSED_IMPLICIT_BIT;
1528 int shift = clz64(f) - 1;
1529 r.cls = float_class_normal;
1530 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1531 r.frac = f << shift;
1537 float16 int64_to_float16(int64_t a, float_status *status)
1539 FloatParts pa = int_to_float(a, status);
1540 return float16_round_pack_canonical(pa, status);
1543 float16 int32_to_float16(int32_t a, float_status *status)
1545 return int64_to_float16(a, status);
1548 float16 int16_to_float16(int16_t a, float_status *status)
1550 return int64_to_float16(a, status);
1553 float32 int64_to_float32(int64_t a, float_status *status)
1555 FloatParts pa = int_to_float(a, status);
1556 return float32_round_pack_canonical(pa, status);
1559 float32 int32_to_float32(int32_t a, float_status *status)
1561 return int64_to_float32(a, status);
1564 float32 int16_to_float32(int16_t a, float_status *status)
1566 return int64_to_float32(a, status);
1569 float64 int64_to_float64(int64_t a, float_status *status)
1571 FloatParts pa = int_to_float(a, status);
1572 return float64_round_pack_canonical(pa, status);
1575 float64 int32_to_float64(int32_t a, float_status *status)
1577 return int64_to_float64(a, status);
1580 float64 int16_to_float64(int16_t a, float_status *status)
1582 return int64_to_float64(a, status);
1587 * Unsigned Integer to float conversions
1589 * Returns the result of converting the unsigned integer `a' to the
1590 * floating-point format. The conversion is performed according to the
1591 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1594 static FloatParts uint_to_float(uint64_t a, float_status *status)
1596 FloatParts r = { .sign = false};
1599 r.cls = float_class_zero;
1601 int spare_bits = clz64(a) - 1;
1602 r.cls = float_class_normal;
1603 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1604 if (spare_bits < 0) {
1605 shift64RightJamming(a, -spare_bits, &a);
1608 r.frac = a << spare_bits;
1615 float16 uint64_to_float16(uint64_t a, float_status *status)
1617 FloatParts pa = uint_to_float(a, status);
1618 return float16_round_pack_canonical(pa, status);
1621 float16 uint32_to_float16(uint32_t a, float_status *status)
1623 return uint64_to_float16(a, status);
1626 float16 uint16_to_float16(uint16_t a, float_status *status)
1628 return uint64_to_float16(a, status);
1631 float32 uint64_to_float32(uint64_t a, float_status *status)
1633 FloatParts pa = uint_to_float(a, status);
1634 return float32_round_pack_canonical(pa, status);
1637 float32 uint32_to_float32(uint32_t a, float_status *status)
1639 return uint64_to_float32(a, status);
1642 float32 uint16_to_float32(uint16_t a, float_status *status)
1644 return uint64_to_float32(a, status);
1647 float64 uint64_to_float64(uint64_t a, float_status *status)
1649 FloatParts pa = uint_to_float(a, status);
1650 return float64_round_pack_canonical(pa, status);
1653 float64 uint32_to_float64(uint32_t a, float_status *status)
1655 return uint64_to_float64(a, status);
1658 float64 uint16_to_float64(uint16_t a, float_status *status)
1660 return uint64_to_float64(a, status);
1664 /* min() and max() functions. These can't be implemented as
1665 * 'compare and pick one input' because that would mishandle
1666 * NaNs and +0 vs -0.
1668 * minnum() and maxnum() functions. These are similar to the min()
1669 * and max() functions but if one of the arguments is a QNaN and
1670 * the other is numerical then the numerical argument is returned.
1671 * SNaNs will get quietened before being returned.
1672 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1673 * and maxNum() operations. min() and max() are the typical min/max
1674 * semantics provided by many CPUs which predate that specification.
1676 * minnummag() and maxnummag() functions correspond to minNumMag()
1677 * and minNumMag() from the IEEE-754 2008.
1679 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1680 bool ieee, bool ismag, float_status *s)
1682 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1684 /* Takes two floating-point values `a' and `b', one of
1685 * which is a NaN, and returns the appropriate NaN
1686 * result. If either `a' or `b' is a signaling NaN,
1687 * the invalid exception is raised.
1689 if (is_snan(a.cls) || is_snan(b.cls)) {
1690 return pick_nan(a, b, s);
1691 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1693 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1697 return pick_nan(a, b, s);
1702 case float_class_normal:
1705 case float_class_inf:
1708 case float_class_zero:
1712 g_assert_not_reached();
1716 case float_class_normal:
1719 case float_class_inf:
1722 case float_class_zero:
1726 g_assert_not_reached();
1730 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1731 bool a_less = a_exp < b_exp;
1732 if (a_exp == b_exp) {
1733 a_less = a.frac < b.frac;
1735 return a_less ^ ismin ? b : a;
1738 if (a.sign == b.sign) {
1739 bool a_less = a_exp < b_exp;
1740 if (a_exp == b_exp) {
1741 a_less = a.frac < b.frac;
1743 return a.sign ^ a_less ^ ismin ? b : a;
1745 return a.sign ^ ismin ? b : a;
1750 #define MINMAX(sz, name, ismin, isiee, ismag) \
1751 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1754 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1755 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1756 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1758 return float ## sz ## _round_pack_canonical(pr, s); \
1761 MINMAX(16, min, true, false, false)
1762 MINMAX(16, minnum, true, true, false)
1763 MINMAX(16, minnummag, true, true, true)
1764 MINMAX(16, max, false, false, false)
1765 MINMAX(16, maxnum, false, true, false)
1766 MINMAX(16, maxnummag, false, true, true)
1768 MINMAX(32, min, true, false, false)
1769 MINMAX(32, minnum, true, true, false)
1770 MINMAX(32, minnummag, true, true, true)
1771 MINMAX(32, max, false, false, false)
1772 MINMAX(32, maxnum, false, true, false)
1773 MINMAX(32, maxnummag, false, true, true)
1775 MINMAX(64, min, true, false, false)
1776 MINMAX(64, minnum, true, true, false)
1777 MINMAX(64, minnummag, true, true, true)
1778 MINMAX(64, max, false, false, false)
1779 MINMAX(64, maxnum, false, true, false)
1780 MINMAX(64, maxnummag, false, true, true)
1784 /* Floating point compare */
1785 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1788 if (is_nan(a.cls) || is_nan(b.cls)) {
1790 a.cls == float_class_snan ||
1791 b.cls == float_class_snan) {
1792 s->float_exception_flags |= float_flag_invalid;
1794 return float_relation_unordered;
1797 if (a.cls == float_class_zero) {
1798 if (b.cls == float_class_zero) {
1799 return float_relation_equal;
1801 return b.sign ? float_relation_greater : float_relation_less;
1802 } else if (b.cls == float_class_zero) {
1803 return a.sign ? float_relation_less : float_relation_greater;
1806 /* The only really important thing about infinity is its sign. If
1807 * both are infinities the sign marks the smallest of the two.
1809 if (a.cls == float_class_inf) {
1810 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1811 return float_relation_equal;
1813 return a.sign ? float_relation_less : float_relation_greater;
1814 } else if (b.cls == float_class_inf) {
1815 return b.sign ? float_relation_greater : float_relation_less;
1818 if (a.sign != b.sign) {
1819 return a.sign ? float_relation_less : float_relation_greater;
1822 if (a.exp == b.exp) {
1823 if (a.frac == b.frac) {
1824 return float_relation_equal;
1827 return a.frac > b.frac ?
1828 float_relation_less : float_relation_greater;
1830 return a.frac > b.frac ?
1831 float_relation_greater : float_relation_less;
1835 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1837 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1842 #define COMPARE(sz) \
1843 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1846 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1847 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1848 return compare_floats(pa, pb, false, s); \
1850 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1853 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1854 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1855 return compare_floats(pa, pb, true, s); \
1864 /* Multiply A by 2 raised to the power N. */
1865 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1867 if (unlikely(is_nan(a.cls))) {
1868 return return_nan(a, s);
1870 if (a.cls == float_class_normal) {
1871 /* The largest float type (even though not supported by FloatParts)
1872 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1873 * still allows rounding to infinity, without allowing overflow
1874 * within the int32_t that backs FloatParts.exp.
1876 n = MIN(MAX(n, -0x10000), 0x10000);
1882 float16 float16_scalbn(float16 a, int n, float_status *status)
1884 FloatParts pa = float16_unpack_canonical(a, status);
1885 FloatParts pr = scalbn_decomposed(pa, n, status);
1886 return float16_round_pack_canonical(pr, status);
1889 float32 float32_scalbn(float32 a, int n, float_status *status)
1891 FloatParts pa = float32_unpack_canonical(a, status);
1892 FloatParts pr = scalbn_decomposed(pa, n, status);
1893 return float32_round_pack_canonical(pr, status);
1896 float64 float64_scalbn(float64 a, int n, float_status *status)
1898 FloatParts pa = float64_unpack_canonical(a, status);
1899 FloatParts pr = scalbn_decomposed(pa, n, status);
1900 return float64_round_pack_canonical(pr, status);
1906 * The old softfloat code did an approximation step before zeroing in
1907 * on the final result. However for simpleness we just compute the
1908 * square root by iterating down from the implicit bit to enough extra
1909 * bits to ensure we get a correctly rounded result.
1911 * This does mean however the calculation is slower than before,
1912 * especially for 64 bit floats.
1915 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1917 uint64_t a_frac, r_frac, s_frac;
1920 if (is_nan(a.cls)) {
1921 return return_nan(a, s);
1923 if (a.cls == float_class_zero) {
1924 return a; /* sqrt(+-0) = +-0 */
1927 s->float_exception_flags |= float_flag_invalid;
1928 return parts_default_nan(s);
1930 if (a.cls == float_class_inf) {
1931 return a; /* sqrt(+inf) = +inf */
1934 assert(a.cls == float_class_normal);
1936 /* We need two overflow bits at the top. Adding room for that is a
1937 * right shift. If the exponent is odd, we can discard the low bit
1938 * by multiplying the fraction by 2; that's a left shift. Combine
1939 * those and we shift right if the exponent is even.
1947 /* Bit-by-bit computation of sqrt. */
1951 /* Iterate from implicit bit down to the 3 extra bits to compute a
1952 * properly rounded result. Remember we've inserted one more bit
1953 * at the top, so these positions are one less.
1955 bit = DECOMPOSED_BINARY_POINT - 1;
1956 last_bit = MAX(p->frac_shift - 4, 0);
1958 uint64_t q = 1ULL << bit;
1959 uint64_t t_frac = s_frac + q;
1960 if (t_frac <= a_frac) {
1961 s_frac = t_frac + q;
1966 } while (--bit >= last_bit);
1968 /* Undo the right shift done above. If there is any remaining
1969 * fraction, the result is inexact. Set the sticky bit.
1971 a.frac = (r_frac << 1) + (a_frac != 0);
1976 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1978 FloatParts pa = float16_unpack_canonical(a, status);
1979 FloatParts pr = sqrt_float(pa, status, &float16_params);
1980 return float16_round_pack_canonical(pr, status);
1983 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1985 FloatParts pa = float32_unpack_canonical(a, status);
1986 FloatParts pr = sqrt_float(pa, status, &float32_params);
1987 return float32_round_pack_canonical(pr, status);
1990 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1992 FloatParts pa = float64_unpack_canonical(a, status);
1993 FloatParts pr = sqrt_float(pa, status, &float64_params);
1994 return float64_round_pack_canonical(pr, status);
1998 /*----------------------------------------------------------------------------
1999 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2000 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2001 | input. If `zSign' is 1, the input is negated before being converted to an
2002 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2003 | is simply rounded to an integer, with the inexact exception raised if the
2004 | input cannot be represented exactly as an integer. However, if the fixed-
2005 | point input is too large, the invalid exception is raised and the largest
2006 | positive or negative integer is returned.
2007 *----------------------------------------------------------------------------*/
2009 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2011 int8_t roundingMode;
2012 flag roundNearestEven;
2013 int8_t roundIncrement, roundBits;
2016 roundingMode = status->float_rounding_mode;
2017 roundNearestEven = ( roundingMode == float_round_nearest_even );
2018 switch (roundingMode) {
2019 case float_round_nearest_even:
2020 case float_round_ties_away:
2021 roundIncrement = 0x40;
2023 case float_round_to_zero:
2026 case float_round_up:
2027 roundIncrement = zSign ? 0 : 0x7f;
2029 case float_round_down:
2030 roundIncrement = zSign ? 0x7f : 0;
2035 roundBits = absZ & 0x7F;
2036 absZ = ( absZ + roundIncrement )>>7;
2037 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2039 if ( zSign ) z = - z;
2040 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2041 float_raise(float_flag_invalid, status);
2042 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2045 status->float_exception_flags |= float_flag_inexact;
2051 /*----------------------------------------------------------------------------
2052 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2053 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2054 | and returns the properly rounded 64-bit integer corresponding to the input.
2055 | If `zSign' is 1, the input is negated before being converted to an integer.
2056 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2057 | the inexact exception raised if the input cannot be represented exactly as
2058 | an integer. However, if the fixed-point input is too large, the invalid
2059 | exception is raised and the largest positive or negative integer is
2061 *----------------------------------------------------------------------------*/
2063 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2064 float_status *status)
2066 int8_t roundingMode;
2067 flag roundNearestEven, increment;
2070 roundingMode = status->float_rounding_mode;
2071 roundNearestEven = ( roundingMode == float_round_nearest_even );
2072 switch (roundingMode) {
2073 case float_round_nearest_even:
2074 case float_round_ties_away:
2075 increment = ((int64_t) absZ1 < 0);
2077 case float_round_to_zero:
2080 case float_round_up:
2081 increment = !zSign && absZ1;
2083 case float_round_down:
2084 increment = zSign && absZ1;
2091 if ( absZ0 == 0 ) goto overflow;
2092 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2095 if ( zSign ) z = - z;
2096 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2098 float_raise(float_flag_invalid, status);
2100 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2101 : LIT64( 0x7FFFFFFFFFFFFFFF );
2104 status->float_exception_flags |= float_flag_inexact;
2110 /*----------------------------------------------------------------------------
2111 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2112 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2113 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2114 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2115 | with the inexact exception raised if the input cannot be represented exactly
2116 | as an integer. However, if the fixed-point input is too large, the invalid
2117 | exception is raised and the largest unsigned integer is returned.
2118 *----------------------------------------------------------------------------*/
2120 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2121 uint64_t absZ1, float_status *status)
2123 int8_t roundingMode;
2124 flag roundNearestEven, increment;
2126 roundingMode = status->float_rounding_mode;
2127 roundNearestEven = (roundingMode == float_round_nearest_even);
2128 switch (roundingMode) {
2129 case float_round_nearest_even:
2130 case float_round_ties_away:
2131 increment = ((int64_t)absZ1 < 0);
2133 case float_round_to_zero:
2136 case float_round_up:
2137 increment = !zSign && absZ1;
2139 case float_round_down:
2140 increment = zSign && absZ1;
2148 float_raise(float_flag_invalid, status);
2149 return LIT64(0xFFFFFFFFFFFFFFFF);
2151 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2154 if (zSign && absZ0) {
2155 float_raise(float_flag_invalid, status);
2160 status->float_exception_flags |= float_flag_inexact;
2165 /*----------------------------------------------------------------------------
2166 | If `a' is denormal and we are in flush-to-zero mode then set the
2167 | input-denormal exception and return zero. Otherwise just return the value.
2168 *----------------------------------------------------------------------------*/
2169 float32 float32_squash_input_denormal(float32 a, float_status *status)
2171 if (status->flush_inputs_to_zero) {
2172 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2173 float_raise(float_flag_input_denormal, status);
2174 return make_float32(float32_val(a) & 0x80000000);
2180 /*----------------------------------------------------------------------------
2181 | Normalizes the subnormal single-precision floating-point value represented
2182 | by the denormalized significand `aSig'. The normalized exponent and
2183 | significand are stored at the locations pointed to by `zExpPtr' and
2184 | `zSigPtr', respectively.
2185 *----------------------------------------------------------------------------*/
2188 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2192 shiftCount = countLeadingZeros32( aSig ) - 8;
2193 *zSigPtr = aSig<<shiftCount;
2194 *zExpPtr = 1 - shiftCount;
2198 /*----------------------------------------------------------------------------
2199 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2200 | and significand `zSig', and returns the proper single-precision floating-
2201 | point value corresponding to the abstract input. Ordinarily, the abstract
2202 | value is simply rounded and packed into the single-precision format, with
2203 | the inexact exception raised if the abstract input cannot be represented
2204 | exactly. However, if the abstract value is too large, the overflow and
2205 | inexact exceptions are raised and an infinity or maximal finite value is
2206 | returned. If the abstract value is too small, the input value is rounded to
2207 | a subnormal number, and the underflow and inexact exceptions are raised if
2208 | the abstract input cannot be represented exactly as a subnormal single-
2209 | precision floating-point number.
2210 | The input significand `zSig' has its binary point between bits 30
2211 | and 29, which is 7 bits to the left of the usual location. This shifted
2212 | significand must be normalized or smaller. If `zSig' is not normalized,
2213 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2214 | and it must not require rounding. In the usual case that `zSig' is
2215 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2216 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2217 | Binary Floating-Point Arithmetic.
2218 *----------------------------------------------------------------------------*/
2220 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2221 float_status *status)
2223 int8_t roundingMode;
2224 flag roundNearestEven;
2225 int8_t roundIncrement, roundBits;
2228 roundingMode = status->float_rounding_mode;
2229 roundNearestEven = ( roundingMode == float_round_nearest_even );
2230 switch (roundingMode) {
2231 case float_round_nearest_even:
2232 case float_round_ties_away:
2233 roundIncrement = 0x40;
2235 case float_round_to_zero:
2238 case float_round_up:
2239 roundIncrement = zSign ? 0 : 0x7f;
2241 case float_round_down:
2242 roundIncrement = zSign ? 0x7f : 0;
2248 roundBits = zSig & 0x7F;
2249 if ( 0xFD <= (uint16_t) zExp ) {
2250 if ( ( 0xFD < zExp )
2251 || ( ( zExp == 0xFD )
2252 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2254 float_raise(float_flag_overflow | float_flag_inexact, status);
2255 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2258 if (status->flush_to_zero) {
2259 float_raise(float_flag_output_denormal, status);
2260 return packFloat32(zSign, 0, 0);
2263 (status->float_detect_tininess
2264 == float_tininess_before_rounding)
2266 || ( zSig + roundIncrement < 0x80000000 );
2267 shift32RightJamming( zSig, - zExp, &zSig );
2269 roundBits = zSig & 0x7F;
2270 if (isTiny && roundBits) {
2271 float_raise(float_flag_underflow, status);
2276 status->float_exception_flags |= float_flag_inexact;
2278 zSig = ( zSig + roundIncrement )>>7;
2279 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2280 if ( zSig == 0 ) zExp = 0;
2281 return packFloat32( zSign, zExp, zSig );
2285 /*----------------------------------------------------------------------------
2286 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2287 | and significand `zSig', and returns the proper single-precision floating-
2288 | point value corresponding to the abstract input. This routine is just like
2289 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2290 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2291 | floating-point exponent.
2292 *----------------------------------------------------------------------------*/
2295 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2296 float_status *status)
2300 shiftCount = countLeadingZeros32( zSig ) - 1;
2301 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2306 /*----------------------------------------------------------------------------
2307 | If `a' is denormal and we are in flush-to-zero mode then set the
2308 | input-denormal exception and return zero. Otherwise just return the value.
2309 *----------------------------------------------------------------------------*/
2310 float64 float64_squash_input_denormal(float64 a, float_status *status)
2312 if (status->flush_inputs_to_zero) {
2313 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2314 float_raise(float_flag_input_denormal, status);
2315 return make_float64(float64_val(a) & (1ULL << 63));
2321 /*----------------------------------------------------------------------------
2322 | Normalizes the subnormal double-precision floating-point value represented
2323 | by the denormalized significand `aSig'. The normalized exponent and
2324 | significand are stored at the locations pointed to by `zExpPtr' and
2325 | `zSigPtr', respectively.
2326 *----------------------------------------------------------------------------*/
2329 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2333 shiftCount = countLeadingZeros64( aSig ) - 11;
2334 *zSigPtr = aSig<<shiftCount;
2335 *zExpPtr = 1 - shiftCount;
2339 /*----------------------------------------------------------------------------
2340 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2341 | double-precision floating-point value, returning the result. After being
2342 | shifted into the proper positions, the three fields are simply added
2343 | together to form the result. This means that any integer portion of `zSig'
2344 | will be added into the exponent. Since a properly normalized significand
2345 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2346 | than the desired result exponent whenever `zSig' is a complete, normalized
2348 *----------------------------------------------------------------------------*/
2350 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2353 return make_float64(
2354 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2358 /*----------------------------------------------------------------------------
2359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2360 | and significand `zSig', and returns the proper double-precision floating-
2361 | point value corresponding to the abstract input. Ordinarily, the abstract
2362 | value is simply rounded and packed into the double-precision format, with
2363 | the inexact exception raised if the abstract input cannot be represented
2364 | exactly. However, if the abstract value is too large, the overflow and
2365 | inexact exceptions are raised and an infinity or maximal finite value is
2366 | returned. If the abstract value is too small, the input value is rounded to
2367 | a subnormal number, and the underflow and inexact exceptions are raised if
2368 | the abstract input cannot be represented exactly as a subnormal double-
2369 | precision floating-point number.
2370 | The input significand `zSig' has its binary point between bits 62
2371 | and 61, which is 10 bits to the left of the usual location. This shifted
2372 | significand must be normalized or smaller. If `zSig' is not normalized,
2373 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2374 | and it must not require rounding. In the usual case that `zSig' is
2375 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2376 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2377 | Binary Floating-Point Arithmetic.
2378 *----------------------------------------------------------------------------*/
2380 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2381 float_status *status)
2383 int8_t roundingMode;
2384 flag roundNearestEven;
2385 int roundIncrement, roundBits;
2388 roundingMode = status->float_rounding_mode;
2389 roundNearestEven = ( roundingMode == float_round_nearest_even );
2390 switch (roundingMode) {
2391 case float_round_nearest_even:
2392 case float_round_ties_away:
2393 roundIncrement = 0x200;
2395 case float_round_to_zero:
2398 case float_round_up:
2399 roundIncrement = zSign ? 0 : 0x3ff;
2401 case float_round_down:
2402 roundIncrement = zSign ? 0x3ff : 0;
2404 case float_round_to_odd:
2405 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2410 roundBits = zSig & 0x3FF;
2411 if ( 0x7FD <= (uint16_t) zExp ) {
2412 if ( ( 0x7FD < zExp )
2413 || ( ( zExp == 0x7FD )
2414 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2416 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2417 roundIncrement != 0;
2418 float_raise(float_flag_overflow | float_flag_inexact, status);
2419 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2422 if (status->flush_to_zero) {
2423 float_raise(float_flag_output_denormal, status);
2424 return packFloat64(zSign, 0, 0);
2427 (status->float_detect_tininess
2428 == float_tininess_before_rounding)
2430 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2431 shift64RightJamming( zSig, - zExp, &zSig );
2433 roundBits = zSig & 0x3FF;
2434 if (isTiny && roundBits) {
2435 float_raise(float_flag_underflow, status);
2437 if (roundingMode == float_round_to_odd) {
2439 * For round-to-odd case, the roundIncrement depends on
2440 * zSig which just changed.
2442 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2447 status->float_exception_flags |= float_flag_inexact;
2449 zSig = ( zSig + roundIncrement )>>10;
2450 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2451 if ( zSig == 0 ) zExp = 0;
2452 return packFloat64( zSign, zExp, zSig );
2456 /*----------------------------------------------------------------------------
2457 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2458 | and significand `zSig', and returns the proper double-precision floating-
2459 | point value corresponding to the abstract input. This routine is just like
2460 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2461 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2462 | floating-point exponent.
2463 *----------------------------------------------------------------------------*/
2466 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2467 float_status *status)
2471 shiftCount = countLeadingZeros64( zSig ) - 1;
2472 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2477 /*----------------------------------------------------------------------------
2478 | Normalizes the subnormal extended double-precision floating-point value
2479 | represented by the denormalized significand `aSig'. The normalized exponent
2480 | and significand are stored at the locations pointed to by `zExpPtr' and
2481 | `zSigPtr', respectively.
2482 *----------------------------------------------------------------------------*/
2484 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2489 shiftCount = countLeadingZeros64( aSig );
2490 *zSigPtr = aSig<<shiftCount;
2491 *zExpPtr = 1 - shiftCount;
2494 /*----------------------------------------------------------------------------
2495 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2496 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2497 | and returns the proper extended double-precision floating-point value
2498 | corresponding to the abstract input. Ordinarily, the abstract value is
2499 | rounded and packed into the extended double-precision format, with the
2500 | inexact exception raised if the abstract input cannot be represented
2501 | exactly. However, if the abstract value is too large, the overflow and
2502 | inexact exceptions are raised and an infinity or maximal finite value is
2503 | returned. If the abstract value is too small, the input value is rounded to
2504 | a subnormal number, and the underflow and inexact exceptions are raised if
2505 | the abstract input cannot be represented exactly as a subnormal extended
2506 | double-precision floating-point number.
2507 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2508 | number of bits as single or double precision, respectively. Otherwise, the
2509 | result is rounded to the full precision of the extended double-precision
2511 | The input significand must be normalized or smaller. If the input
2512 | significand is not normalized, `zExp' must be 0; in that case, the result
2513 | returned is a subnormal number, and it must not require rounding. The
2514 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2515 | Floating-Point Arithmetic.
2516 *----------------------------------------------------------------------------*/
2518 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2519 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2520 float_status *status)
2522 int8_t roundingMode;
2523 flag roundNearestEven, increment, isTiny;
2524 int64_t roundIncrement, roundMask, roundBits;
2526 roundingMode = status->float_rounding_mode;
2527 roundNearestEven = ( roundingMode == float_round_nearest_even );
2528 if ( roundingPrecision == 80 ) goto precision80;
2529 if ( roundingPrecision == 64 ) {
2530 roundIncrement = LIT64( 0x0000000000000400 );
2531 roundMask = LIT64( 0x00000000000007FF );
2533 else if ( roundingPrecision == 32 ) {
2534 roundIncrement = LIT64( 0x0000008000000000 );
2535 roundMask = LIT64( 0x000000FFFFFFFFFF );
2540 zSig0 |= ( zSig1 != 0 );
2541 switch (roundingMode) {
2542 case float_round_nearest_even:
2543 case float_round_ties_away:
2545 case float_round_to_zero:
2548 case float_round_up:
2549 roundIncrement = zSign ? 0 : roundMask;
2551 case float_round_down:
2552 roundIncrement = zSign ? roundMask : 0;
2557 roundBits = zSig0 & roundMask;
2558 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2559 if ( ( 0x7FFE < zExp )
2560 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2565 if (status->flush_to_zero) {
2566 float_raise(float_flag_output_denormal, status);
2567 return packFloatx80(zSign, 0, 0);
2570 (status->float_detect_tininess
2571 == float_tininess_before_rounding)
2573 || ( zSig0 <= zSig0 + roundIncrement );
2574 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2576 roundBits = zSig0 & roundMask;
2577 if (isTiny && roundBits) {
2578 float_raise(float_flag_underflow, status);
2581 status->float_exception_flags |= float_flag_inexact;
2583 zSig0 += roundIncrement;
2584 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2585 roundIncrement = roundMask + 1;
2586 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2587 roundMask |= roundIncrement;
2589 zSig0 &= ~ roundMask;
2590 return packFloatx80( zSign, zExp, zSig0 );
2594 status->float_exception_flags |= float_flag_inexact;
2596 zSig0 += roundIncrement;
2597 if ( zSig0 < roundIncrement ) {
2599 zSig0 = LIT64( 0x8000000000000000 );
2601 roundIncrement = roundMask + 1;
2602 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2603 roundMask |= roundIncrement;
2605 zSig0 &= ~ roundMask;
2606 if ( zSig0 == 0 ) zExp = 0;
2607 return packFloatx80( zSign, zExp, zSig0 );
2609 switch (roundingMode) {
2610 case float_round_nearest_even:
2611 case float_round_ties_away:
2612 increment = ((int64_t)zSig1 < 0);
2614 case float_round_to_zero:
2617 case float_round_up:
2618 increment = !zSign && zSig1;
2620 case float_round_down:
2621 increment = zSign && zSig1;
2626 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2627 if ( ( 0x7FFE < zExp )
2628 || ( ( zExp == 0x7FFE )
2629 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2635 float_raise(float_flag_overflow | float_flag_inexact, status);
2636 if ( ( roundingMode == float_round_to_zero )
2637 || ( zSign && ( roundingMode == float_round_up ) )
2638 || ( ! zSign && ( roundingMode == float_round_down ) )
2640 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2642 return packFloatx80(zSign,
2643 floatx80_infinity_high,
2644 floatx80_infinity_low);
2648 (status->float_detect_tininess
2649 == float_tininess_before_rounding)
2652 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2653 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2655 if (isTiny && zSig1) {
2656 float_raise(float_flag_underflow, status);
2659 status->float_exception_flags |= float_flag_inexact;
2661 switch (roundingMode) {
2662 case float_round_nearest_even:
2663 case float_round_ties_away:
2664 increment = ((int64_t)zSig1 < 0);
2666 case float_round_to_zero:
2669 case float_round_up:
2670 increment = !zSign && zSig1;
2672 case float_round_down:
2673 increment = zSign && zSig1;
2681 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2682 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2684 return packFloatx80( zSign, zExp, zSig0 );
2688 status->float_exception_flags |= float_flag_inexact;
2694 zSig0 = LIT64( 0x8000000000000000 );
2697 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2701 if ( zSig0 == 0 ) zExp = 0;
2703 return packFloatx80( zSign, zExp, zSig0 );
2707 /*----------------------------------------------------------------------------
2708 | Takes an abstract floating-point value having sign `zSign', exponent
2709 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2710 | and returns the proper extended double-precision floating-point value
2711 | corresponding to the abstract input. This routine is just like
2712 | `roundAndPackFloatx80' except that the input significand does not have to be
2714 *----------------------------------------------------------------------------*/
2716 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2717 flag zSign, int32_t zExp,
2718 uint64_t zSig0, uint64_t zSig1,
2719 float_status *status)
2728 shiftCount = countLeadingZeros64( zSig0 );
2729 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2731 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2732 zSig0, zSig1, status);
2736 /*----------------------------------------------------------------------------
2737 | Returns the least-significant 64 fraction bits of the quadruple-precision
2738 | floating-point value `a'.
2739 *----------------------------------------------------------------------------*/
2741 static inline uint64_t extractFloat128Frac1( float128 a )
2748 /*----------------------------------------------------------------------------
2749 | Returns the most-significant 48 fraction bits of the quadruple-precision
2750 | floating-point value `a'.
2751 *----------------------------------------------------------------------------*/
2753 static inline uint64_t extractFloat128Frac0( float128 a )
2756 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2760 /*----------------------------------------------------------------------------
2761 | Returns the exponent bits of the quadruple-precision floating-point value
2763 *----------------------------------------------------------------------------*/
2765 static inline int32_t extractFloat128Exp( float128 a )
2768 return ( a.high>>48 ) & 0x7FFF;
2772 /*----------------------------------------------------------------------------
2773 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2774 *----------------------------------------------------------------------------*/
2776 static inline flag extractFloat128Sign( float128 a )
2783 /*----------------------------------------------------------------------------
2784 | Normalizes the subnormal quadruple-precision floating-point value
2785 | represented by the denormalized significand formed by the concatenation of
2786 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2787 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2788 | significand are stored at the location pointed to by `zSig0Ptr', and the
2789 | least significant 64 bits of the normalized significand are stored at the
2790 | location pointed to by `zSig1Ptr'.
2791 *----------------------------------------------------------------------------*/
2794 normalizeFloat128Subnormal(
2805 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2806 if ( shiftCount < 0 ) {
2807 *zSig0Ptr = aSig1>>( - shiftCount );
2808 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2811 *zSig0Ptr = aSig1<<shiftCount;
2814 *zExpPtr = - shiftCount - 63;
2817 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2818 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2819 *zExpPtr = 1 - shiftCount;
2824 /*----------------------------------------------------------------------------
2825 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2826 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2827 | floating-point value, returning the result. After being shifted into the
2828 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2829 | added together to form the most significant 32 bits of the result. This
2830 | means that any integer portion of `zSig0' will be added into the exponent.
2831 | Since a properly normalized significand will have an integer portion equal
2832 | to 1, the `zExp' input should be 1 less than the desired result exponent
2833 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2835 *----------------------------------------------------------------------------*/
2837 static inline float128
2838 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2843 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2848 /*----------------------------------------------------------------------------
2849 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2850 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2851 | and `zSig2', and returns the proper quadruple-precision floating-point value
2852 | corresponding to the abstract input. Ordinarily, the abstract value is
2853 | simply rounded and packed into the quadruple-precision format, with the
2854 | inexact exception raised if the abstract input cannot be represented
2855 | exactly. However, if the abstract value is too large, the overflow and
2856 | inexact exceptions are raised and an infinity or maximal finite value is
2857 | returned. If the abstract value is too small, the input value is rounded to
2858 | a subnormal number, and the underflow and inexact exceptions are raised if
2859 | the abstract input cannot be represented exactly as a subnormal quadruple-
2860 | precision floating-point number.
2861 | The input significand must be normalized or smaller. If the input
2862 | significand is not normalized, `zExp' must be 0; in that case, the result
2863 | returned is a subnormal number, and it must not require rounding. In the
2864 | usual case that the input significand is normalized, `zExp' must be 1 less
2865 | than the ``true'' floating-point exponent. The handling of underflow and
2866 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2867 *----------------------------------------------------------------------------*/
2869 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2870 uint64_t zSig0, uint64_t zSig1,
2871 uint64_t zSig2, float_status *status)
2873 int8_t roundingMode;
2874 flag roundNearestEven, increment, isTiny;
2876 roundingMode = status->float_rounding_mode;
2877 roundNearestEven = ( roundingMode == float_round_nearest_even );
2878 switch (roundingMode) {
2879 case float_round_nearest_even:
2880 case float_round_ties_away:
2881 increment = ((int64_t)zSig2 < 0);
2883 case float_round_to_zero:
2886 case float_round_up:
2887 increment = !zSign && zSig2;
2889 case float_round_down:
2890 increment = zSign && zSig2;
2892 case float_round_to_odd:
2893 increment = !(zSig1 & 0x1) && zSig2;
2898 if ( 0x7FFD <= (uint32_t) zExp ) {
2899 if ( ( 0x7FFD < zExp )
2900 || ( ( zExp == 0x7FFD )
2902 LIT64( 0x0001FFFFFFFFFFFF ),
2903 LIT64( 0xFFFFFFFFFFFFFFFF ),
2910 float_raise(float_flag_overflow | float_flag_inexact, status);
2911 if ( ( roundingMode == float_round_to_zero )
2912 || ( zSign && ( roundingMode == float_round_up ) )
2913 || ( ! zSign && ( roundingMode == float_round_down ) )
2914 || (roundingMode == float_round_to_odd)
2920 LIT64( 0x0000FFFFFFFFFFFF ),
2921 LIT64( 0xFFFFFFFFFFFFFFFF )
2924 return packFloat128( zSign, 0x7FFF, 0, 0 );
2927 if (status->flush_to_zero) {
2928 float_raise(float_flag_output_denormal, status);
2929 return packFloat128(zSign, 0, 0, 0);
2932 (status->float_detect_tininess
2933 == float_tininess_before_rounding)
2939 LIT64( 0x0001FFFFFFFFFFFF ),
2940 LIT64( 0xFFFFFFFFFFFFFFFF )
2942 shift128ExtraRightJamming(
2943 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2945 if (isTiny && zSig2) {
2946 float_raise(float_flag_underflow, status);
2948 switch (roundingMode) {
2949 case float_round_nearest_even:
2950 case float_round_ties_away:
2951 increment = ((int64_t)zSig2 < 0);
2953 case float_round_to_zero:
2956 case float_round_up:
2957 increment = !zSign && zSig2;
2959 case float_round_down:
2960 increment = zSign && zSig2;
2962 case float_round_to_odd:
2963 increment = !(zSig1 & 0x1) && zSig2;
2971 status->float_exception_flags |= float_flag_inexact;
2974 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2975 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2978 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2980 return packFloat128( zSign, zExp, zSig0, zSig1 );
2984 /*----------------------------------------------------------------------------
2985 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2986 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2987 | returns the proper quadruple-precision floating-point value corresponding
2988 | to the abstract input. This routine is just like `roundAndPackFloat128'
2989 | except that the input significand has fewer bits and does not have to be
2990 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2992 *----------------------------------------------------------------------------*/
2994 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2995 uint64_t zSig0, uint64_t zSig1,
2996 float_status *status)
3006 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3007 if ( 0 <= shiftCount ) {
3009 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3012 shift128ExtraRightJamming(
3013 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3016 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3021 /*----------------------------------------------------------------------------
3022 | Returns the result of converting the 32-bit two's complement integer `a'
3023 | to the extended double-precision floating-point format. The conversion
3024 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3026 *----------------------------------------------------------------------------*/
3028 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3035 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3037 absA = zSign ? - a : a;
3038 shiftCount = countLeadingZeros32( absA ) + 32;
3040 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3044 /*----------------------------------------------------------------------------
3045 | Returns the result of converting the 32-bit two's complement integer `a' to
3046 | the quadruple-precision floating-point format. The conversion is performed
3047 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3048 *----------------------------------------------------------------------------*/
3050 float128 int32_to_float128(int32_t a, float_status *status)
3057 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3059 absA = zSign ? - a : a;
3060 shiftCount = countLeadingZeros32( absA ) + 17;
3062 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3066 /*----------------------------------------------------------------------------
3067 | Returns the result of converting the 64-bit two's complement integer `a'
3068 | to the extended double-precision floating-point format. The conversion
3069 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3071 *----------------------------------------------------------------------------*/
3073 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3079 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3081 absA = zSign ? - a : a;
3082 shiftCount = countLeadingZeros64( absA );
3083 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3087 /*----------------------------------------------------------------------------
3088 | Returns the result of converting the 64-bit two's complement integer `a' to
3089 | the quadruple-precision floating-point format. The conversion is performed
3090 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3091 *----------------------------------------------------------------------------*/
3093 float128 int64_to_float128(int64_t a, float_status *status)
3099 uint64_t zSig0, zSig1;
3101 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3103 absA = zSign ? - a : a;
3104 shiftCount = countLeadingZeros64( absA ) + 49;
3105 zExp = 0x406E - shiftCount;
3106 if ( 64 <= shiftCount ) {
3115 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3116 return packFloat128( zSign, zExp, zSig0, zSig1 );
3120 /*----------------------------------------------------------------------------
3121 | Returns the result of converting the 64-bit unsigned integer `a'
3122 | to the quadruple-precision floating-point format. The conversion is performed
3123 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3124 *----------------------------------------------------------------------------*/
3126 float128 uint64_to_float128(uint64_t a, float_status *status)
3129 return float128_zero;
3131 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3137 /*----------------------------------------------------------------------------
3138 | Returns the result of converting the single-precision floating-point value
3139 | `a' to the double-precision floating-point format. The conversion is
3140 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3142 *----------------------------------------------------------------------------*/
3144 float64 float32_to_float64(float32 a, float_status *status)
3149 a = float32_squash_input_denormal(a, status);
3151 aSig = extractFloat32Frac( a );
3152 aExp = extractFloat32Exp( a );
3153 aSign = extractFloat32Sign( a );
3154 if ( aExp == 0xFF ) {
3156 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3158 return packFloat64( aSign, 0x7FF, 0 );
3161 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3162 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3165 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3169 /*----------------------------------------------------------------------------
3170 | Returns the result of converting the single-precision floating-point value
3171 | `a' to the extended double-precision floating-point format. The conversion
3172 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3174 *----------------------------------------------------------------------------*/
3176 floatx80 float32_to_floatx80(float32 a, float_status *status)
3182 a = float32_squash_input_denormal(a, status);
3183 aSig = extractFloat32Frac( a );
3184 aExp = extractFloat32Exp( a );
3185 aSign = extractFloat32Sign( a );
3186 if ( aExp == 0xFF ) {
3188 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3190 return packFloatx80(aSign,
3191 floatx80_infinity_high,
3192 floatx80_infinity_low);
3195 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3196 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3199 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3203 /*----------------------------------------------------------------------------
3204 | Returns the result of converting the single-precision floating-point value
3205 | `a' to the double-precision floating-point format. The conversion is
3206 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3208 *----------------------------------------------------------------------------*/
3210 float128 float32_to_float128(float32 a, float_status *status)
3216 a = float32_squash_input_denormal(a, status);
3217 aSig = extractFloat32Frac( a );
3218 aExp = extractFloat32Exp( a );
3219 aSign = extractFloat32Sign( a );
3220 if ( aExp == 0xFF ) {
3222 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3224 return packFloat128( aSign, 0x7FFF, 0, 0 );
3227 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3228 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3231 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3235 /*----------------------------------------------------------------------------
3236 | Returns the remainder of the single-precision floating-point value `a'
3237 | with respect to the corresponding value `b'. The operation is performed
3238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3239 *----------------------------------------------------------------------------*/
3241 float32 float32_rem(float32 a, float32 b, float_status *status)
3244 int aExp, bExp, expDiff;
3245 uint32_t aSig, bSig;
3247 uint64_t aSig64, bSig64, q64;
3248 uint32_t alternateASig;
3250 a = float32_squash_input_denormal(a, status);
3251 b = float32_squash_input_denormal(b, status);
3253 aSig = extractFloat32Frac( a );
3254 aExp = extractFloat32Exp( a );
3255 aSign = extractFloat32Sign( a );
3256 bSig = extractFloat32Frac( b );
3257 bExp = extractFloat32Exp( b );
3258 if ( aExp == 0xFF ) {
3259 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3260 return propagateFloat32NaN(a, b, status);
3262 float_raise(float_flag_invalid, status);
3263 return float32_default_nan(status);
3265 if ( bExp == 0xFF ) {
3267 return propagateFloat32NaN(a, b, status);
3273 float_raise(float_flag_invalid, status);
3274 return float32_default_nan(status);
3276 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3279 if ( aSig == 0 ) return a;
3280 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3282 expDiff = aExp - bExp;
3285 if ( expDiff < 32 ) {
3288 if ( expDiff < 0 ) {
3289 if ( expDiff < -1 ) return a;
3292 q = ( bSig <= aSig );
3293 if ( q ) aSig -= bSig;
3294 if ( 0 < expDiff ) {
3295 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3298 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3306 if ( bSig <= aSig ) aSig -= bSig;
3307 aSig64 = ( (uint64_t) aSig )<<40;
3308 bSig64 = ( (uint64_t) bSig )<<40;
3310 while ( 0 < expDiff ) {
3311 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3312 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3313 aSig64 = - ( ( bSig * q64 )<<38 );
3317 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3318 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3319 q = q64>>( 64 - expDiff );
3321 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3324 alternateASig = aSig;
3327 } while ( 0 <= (int32_t) aSig );
3328 sigMean = aSig + alternateASig;
3329 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3330 aSig = alternateASig;
3332 zSign = ( (int32_t) aSig < 0 );
3333 if ( zSign ) aSig = - aSig;
3334 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3339 /*----------------------------------------------------------------------------
3340 | Returns the binary exponential of the single-precision floating-point value
3341 | `a'. The operation is performed according to the IEC/IEEE Standard for
3342 | Binary Floating-Point Arithmetic.
3344 | Uses the following identities:
3346 | 1. -------------------------------------------------------------------------
3350 | 2. -------------------------------------------------------------------------
3353 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3355 *----------------------------------------------------------------------------*/
3357 static const float64 float32_exp2_coefficients[15] =
3359 const_float64( 0x3ff0000000000000ll ), /* 1 */
3360 const_float64( 0x3fe0000000000000ll ), /* 2 */
3361 const_float64( 0x3fc5555555555555ll ), /* 3 */
3362 const_float64( 0x3fa5555555555555ll ), /* 4 */
3363 const_float64( 0x3f81111111111111ll ), /* 5 */
3364 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3365 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3366 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3367 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3368 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3369 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3370 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3371 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3372 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3373 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3376 float32 float32_exp2(float32 a, float_status *status)
3383 a = float32_squash_input_denormal(a, status);
3385 aSig = extractFloat32Frac( a );
3386 aExp = extractFloat32Exp( a );
3387 aSign = extractFloat32Sign( a );
3389 if ( aExp == 0xFF) {
3391 return propagateFloat32NaN(a, float32_zero, status);
3393 return (aSign) ? float32_zero : a;
3396 if (aSig == 0) return float32_one;
3399 float_raise(float_flag_inexact, status);
3401 /* ******************************* */
3402 /* using float64 for approximation */
3403 /* ******************************* */
3404 x = float32_to_float64(a, status);
3405 x = float64_mul(x, float64_ln2, status);
3409 for (i = 0 ; i < 15 ; i++) {
3412 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3413 r = float64_add(r, f, status);
3415 xn = float64_mul(xn, x, status);
3418 return float64_to_float32(r, status);
3421 /*----------------------------------------------------------------------------
3422 | Returns the binary log of the single-precision floating-point value `a'.
3423 | The operation is performed according to the IEC/IEEE Standard for Binary
3424 | Floating-Point Arithmetic.
3425 *----------------------------------------------------------------------------*/
3426 float32 float32_log2(float32 a, float_status *status)
3430 uint32_t aSig, zSig, i;
3432 a = float32_squash_input_denormal(a, status);
3433 aSig = extractFloat32Frac( a );
3434 aExp = extractFloat32Exp( a );
3435 aSign = extractFloat32Sign( a );
3438 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3439 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3442 float_raise(float_flag_invalid, status);
3443 return float32_default_nan(status);
3445 if ( aExp == 0xFF ) {
3447 return propagateFloat32NaN(a, float32_zero, status);
3457 for (i = 1 << 22; i > 0; i >>= 1) {
3458 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3459 if ( aSig & 0x01000000 ) {
3468 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3471 /*----------------------------------------------------------------------------
3472 | Returns 1 if the single-precision floating-point value `a' is equal to
3473 | the corresponding value `b', and 0 otherwise. The invalid exception is
3474 | raised if either operand is a NaN. Otherwise, the comparison is performed
3475 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3476 *----------------------------------------------------------------------------*/
3478 int float32_eq(float32 a, float32 b, float_status *status)
3481 a = float32_squash_input_denormal(a, status);
3482 b = float32_squash_input_denormal(b, status);
3484 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3485 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3487 float_raise(float_flag_invalid, status);
3490 av = float32_val(a);
3491 bv = float32_val(b);
3492 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3495 /*----------------------------------------------------------------------------
3496 | Returns 1 if the single-precision floating-point value `a' is less than
3497 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3498 | exception is raised if either operand is a NaN. The comparison is performed
3499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3500 *----------------------------------------------------------------------------*/
3502 int float32_le(float32 a, float32 b, float_status *status)
3506 a = float32_squash_input_denormal(a, status);
3507 b = float32_squash_input_denormal(b, status);
3509 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3510 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3512 float_raise(float_flag_invalid, status);
3515 aSign = extractFloat32Sign( a );
3516 bSign = extractFloat32Sign( b );
3517 av = float32_val(a);
3518 bv = float32_val(b);
3519 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3520 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3524 /*----------------------------------------------------------------------------
3525 | Returns 1 if the single-precision floating-point value `a' is less than
3526 | the corresponding value `b', and 0 otherwise. The invalid exception is
3527 | raised if either operand is a NaN. The comparison is performed according
3528 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3529 *----------------------------------------------------------------------------*/
3531 int float32_lt(float32 a, float32 b, float_status *status)
3535 a = float32_squash_input_denormal(a, status);
3536 b = float32_squash_input_denormal(b, status);
3538 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3539 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3541 float_raise(float_flag_invalid, status);
3544 aSign = extractFloat32Sign( a );
3545 bSign = extractFloat32Sign( b );
3546 av = float32_val(a);
3547 bv = float32_val(b);
3548 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3549 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3553 /*----------------------------------------------------------------------------
3554 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3555 | be compared, and 0 otherwise. The invalid exception is raised if either
3556 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3557 | Standard for Binary Floating-Point Arithmetic.
3558 *----------------------------------------------------------------------------*/
3560 int float32_unordered(float32 a, float32 b, float_status *status)
3562 a = float32_squash_input_denormal(a, status);
3563 b = float32_squash_input_denormal(b, status);
3565 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3566 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3568 float_raise(float_flag_invalid, status);
3574 /*----------------------------------------------------------------------------
3575 | Returns 1 if the single-precision floating-point value `a' is equal to
3576 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3577 | exception. The comparison is performed according to the IEC/IEEE Standard
3578 | for Binary Floating-Point Arithmetic.
3579 *----------------------------------------------------------------------------*/
3581 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3583 a = float32_squash_input_denormal(a, status);
3584 b = float32_squash_input_denormal(b, status);
3586 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3587 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3589 if (float32_is_signaling_nan(a, status)
3590 || float32_is_signaling_nan(b, status)) {
3591 float_raise(float_flag_invalid, status);
3595 return ( float32_val(a) == float32_val(b) ) ||
3596 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3599 /*----------------------------------------------------------------------------
3600 | Returns 1 if the single-precision floating-point value `a' is less than or
3601 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3602 | cause an exception. Otherwise, the comparison is performed according to the
3603 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3604 *----------------------------------------------------------------------------*/
3606 int float32_le_quiet(float32 a, float32 b, float_status *status)
3610 a = float32_squash_input_denormal(a, status);
3611 b = float32_squash_input_denormal(b, status);
3613 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3614 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3616 if (float32_is_signaling_nan(a, status)
3617 || float32_is_signaling_nan(b, status)) {
3618 float_raise(float_flag_invalid, status);
3622 aSign = extractFloat32Sign( a );
3623 bSign = extractFloat32Sign( b );
3624 av = float32_val(a);
3625 bv = float32_val(b);
3626 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3627 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3631 /*----------------------------------------------------------------------------
3632 | Returns 1 if the single-precision floating-point value `a' is less than
3633 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3634 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3635 | Standard for Binary Floating-Point Arithmetic.
3636 *----------------------------------------------------------------------------*/
3638 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3642 a = float32_squash_input_denormal(a, status);
3643 b = float32_squash_input_denormal(b, status);
3645 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3646 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3648 if (float32_is_signaling_nan(a, status)
3649 || float32_is_signaling_nan(b, status)) {
3650 float_raise(float_flag_invalid, status);
3654 aSign = extractFloat32Sign( a );
3655 bSign = extractFloat32Sign( b );
3656 av = float32_val(a);
3657 bv = float32_val(b);
3658 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3659 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3663 /*----------------------------------------------------------------------------
3664 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3665 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3666 | comparison is performed according to the IEC/IEEE Standard for Binary
3667 | Floating-Point Arithmetic.
3668 *----------------------------------------------------------------------------*/
3670 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3672 a = float32_squash_input_denormal(a, status);
3673 b = float32_squash_input_denormal(b, status);
3675 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3676 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3678 if (float32_is_signaling_nan(a, status)
3679 || float32_is_signaling_nan(b, status)) {
3680 float_raise(float_flag_invalid, status);
3688 /*----------------------------------------------------------------------------
3689 | Returns the result of converting the double-precision floating-point value
3690 | `a' to the single-precision floating-point format. The conversion is
3691 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3693 *----------------------------------------------------------------------------*/
3695 float32 float64_to_float32(float64 a, float_status *status)
3701 a = float64_squash_input_denormal(a, status);
3703 aSig = extractFloat64Frac( a );
3704 aExp = extractFloat64Exp( a );
3705 aSign = extractFloat64Sign( a );
3706 if ( aExp == 0x7FF ) {
3708 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3710 return packFloat32( aSign, 0xFF, 0 );
3712 shift64RightJamming( aSig, 22, &aSig );
3714 if ( aExp || zSig ) {
3718 return roundAndPackFloat32(aSign, aExp, zSig, status);
3723 /*----------------------------------------------------------------------------
3724 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3725 | half-precision floating-point value, returning the result. After being
3726 | shifted into the proper positions, the three fields are simply added
3727 | together to form the result. This means that any integer portion of `zSig'
3728 | will be added into the exponent. Since a properly normalized significand
3729 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3730 | than the desired result exponent whenever `zSig' is a complete, normalized
3732 *----------------------------------------------------------------------------*/
3733 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3735 return make_float16(
3736 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3739 /*----------------------------------------------------------------------------
3740 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3741 | and significand `zSig', and returns the proper half-precision floating-
3742 | point value corresponding to the abstract input. Ordinarily, the abstract
3743 | value is simply rounded and packed into the half-precision format, with
3744 | the inexact exception raised if the abstract input cannot be represented
3745 | exactly. However, if the abstract value is too large, the overflow and
3746 | inexact exceptions are raised and an infinity or maximal finite value is
3747 | returned. If the abstract value is too small, the input value is rounded to
3748 | a subnormal number, and the underflow and inexact exceptions are raised if
3749 | the abstract input cannot be represented exactly as a subnormal half-
3750 | precision floating-point number.
3751 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3752 | ARM-style "alternative representation", which omits the NaN and Inf
3753 | encodings in order to raise the maximum representable exponent by one.
3754 | The input significand `zSig' has its binary point between bits 22
3755 | and 23, which is 13 bits to the left of the usual location. This shifted
3756 | significand must be normalized or smaller. If `zSig' is not normalized,
3757 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3758 | and it must not require rounding. In the usual case that `zSig' is
3759 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3760 | Note the slightly odd position of the binary point in zSig compared with the
3761 | other roundAndPackFloat functions. This should probably be fixed if we
3762 | need to implement more float16 routines than just conversion.
3763 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3764 | Binary Floating-Point Arithmetic.
3765 *----------------------------------------------------------------------------*/
3767 static float16 roundAndPackFloat16(flag zSign, int zExp,
3768 uint32_t zSig, flag ieee,
3769 float_status *status)
3771 int maxexp = ieee ? 29 : 30;
3774 bool rounding_bumps_exp;
3775 bool is_tiny = false;
3777 /* Calculate the mask of bits of the mantissa which are not
3778 * representable in half-precision and will be lost.
3781 /* Will be denormal in halfprec */
3787 /* Normal number in halfprec */
3791 switch (status->float_rounding_mode) {
3792 case float_round_nearest_even:
3793 increment = (mask + 1) >> 1;
3794 if ((zSig & mask) == increment) {
3795 increment = zSig & (increment << 1);
3798 case float_round_ties_away:
3799 increment = (mask + 1) >> 1;
3801 case float_round_up:
3802 increment = zSign ? 0 : mask;
3804 case float_round_down:
3805 increment = zSign ? mask : 0;
3807 default: /* round_to_zero */
3812 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3814 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3816 float_raise(float_flag_overflow | float_flag_inexact, status);
3817 return packFloat16(zSign, 0x1f, 0);
3819 float_raise(float_flag_invalid, status);
3820 return packFloat16(zSign, 0x1f, 0x3ff);
3825 /* Note that flush-to-zero does not affect half-precision results */
3827 (status->float_detect_tininess == float_tininess_before_rounding)
3829 || (!rounding_bumps_exp);
3832 float_raise(float_flag_inexact, status);
3834 float_raise(float_flag_underflow, status);
3839 if (rounding_bumps_exp) {
3845 return packFloat16(zSign, 0, 0);
3851 return packFloat16(zSign, zExp, zSig >> 13);
3854 /*----------------------------------------------------------------------------
3855 | If `a' is denormal and we are in flush-to-zero mode then set the
3856 | input-denormal exception and return zero. Otherwise just return the value.
3857 *----------------------------------------------------------------------------*/
3858 float16 float16_squash_input_denormal(float16 a, float_status *status)
3860 if (status->flush_inputs_to_zero) {
3861 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3862 float_raise(float_flag_input_denormal, status);
3863 return make_float16(float16_val(a) & 0x8000);
3869 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3872 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3873 *zSigPtr = aSig << shiftCount;
3874 *zExpPtr = 1 - shiftCount;
3877 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3878 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3880 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3886 aSign = extractFloat16Sign(a);
3887 aExp = extractFloat16Exp(a);
3888 aSig = extractFloat16Frac(a);
3890 if (aExp == 0x1f && ieee) {
3892 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3894 return packFloat32(aSign, 0xff, 0);
3898 return packFloat32(aSign, 0, 0);
3901 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3904 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3907 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3913 a = float32_squash_input_denormal(a, status);
3915 aSig = extractFloat32Frac( a );
3916 aExp = extractFloat32Exp( a );
3917 aSign = extractFloat32Sign( a );
3918 if ( aExp == 0xFF ) {
3920 /* Input is a NaN */
3922 float_raise(float_flag_invalid, status);
3923 return packFloat16(aSign, 0, 0);
3925 return commonNaNToFloat16(
3926 float32ToCommonNaN(a, status), status);
3930 float_raise(float_flag_invalid, status);
3931 return packFloat16(aSign, 0x1f, 0x3ff);
3933 return packFloat16(aSign, 0x1f, 0);
3935 if (aExp == 0 && aSig == 0) {
3936 return packFloat16(aSign, 0, 0);
3938 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3939 * even if the input is denormal; however this is harmless because
3940 * the largest possible single-precision denormal is still smaller
3941 * than the smallest representable half-precision denormal, and so we
3942 * will end up ignoring aSig and returning via the "always return zero"
3948 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3951 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3957 aSign = extractFloat16Sign(a);
3958 aExp = extractFloat16Exp(a);
3959 aSig = extractFloat16Frac(a);
3961 if (aExp == 0x1f && ieee) {
3963 return commonNaNToFloat64(
3964 float16ToCommonNaN(a, status), status);
3966 return packFloat64(aSign, 0x7ff, 0);
3970 return packFloat64(aSign, 0, 0);
3973 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3976 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3979 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3986 a = float64_squash_input_denormal(a, status);
3988 aSig = extractFloat64Frac(a);
3989 aExp = extractFloat64Exp(a);
3990 aSign = extractFloat64Sign(a);
3991 if (aExp == 0x7FF) {
3993 /* Input is a NaN */
3995 float_raise(float_flag_invalid, status);
3996 return packFloat16(aSign, 0, 0);
3998 return commonNaNToFloat16(
3999 float64ToCommonNaN(a, status), status);
4003 float_raise(float_flag_invalid, status);
4004 return packFloat16(aSign, 0x1f, 0x3ff);
4006 return packFloat16(aSign, 0x1f, 0);
4008 shift64RightJamming(aSig, 29, &aSig);
4010 if (aExp == 0 && zSig == 0) {
4011 return packFloat16(aSign, 0, 0);
4013 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4014 * even if the input is denormal; however this is harmless because
4015 * the largest possible single-precision denormal is still smaller
4016 * than the smallest representable half-precision denormal, and so we
4017 * will end up ignoring aSig and returning via the "always return zero"
4023 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4026 /*----------------------------------------------------------------------------
4027 | Returns the result of converting the double-precision floating-point value
4028 | `a' to the extended double-precision floating-point format. The conversion
4029 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4031 *----------------------------------------------------------------------------*/
4033 floatx80 float64_to_floatx80(float64 a, float_status *status)
4039 a = float64_squash_input_denormal(a, status);
4040 aSig = extractFloat64Frac( a );
4041 aExp = extractFloat64Exp( a );
4042 aSign = extractFloat64Sign( a );
4043 if ( aExp == 0x7FF ) {
4045 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4047 return packFloatx80(aSign,
4048 floatx80_infinity_high,
4049 floatx80_infinity_low);
4052 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4053 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4057 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4061 /*----------------------------------------------------------------------------
4062 | Returns the result of converting the double-precision floating-point value
4063 | `a' to the quadruple-precision floating-point format. The conversion is
4064 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4066 *----------------------------------------------------------------------------*/
4068 float128 float64_to_float128(float64 a, float_status *status)
4072 uint64_t aSig, zSig0, zSig1;
4074 a = float64_squash_input_denormal(a, status);
4075 aSig = extractFloat64Frac( a );
4076 aExp = extractFloat64Exp( a );
4077 aSign = extractFloat64Sign( a );
4078 if ( aExp == 0x7FF ) {
4080 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4082 return packFloat128( aSign, 0x7FFF, 0, 0 );
4085 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4086 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4089 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4090 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4095 /*----------------------------------------------------------------------------
4096 | Returns the remainder of the double-precision floating-point value `a'
4097 | with respect to the corresponding value `b'. The operation is performed
4098 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4099 *----------------------------------------------------------------------------*/
4101 float64 float64_rem(float64 a, float64 b, float_status *status)
4104 int aExp, bExp, expDiff;
4105 uint64_t aSig, bSig;
4106 uint64_t q, alternateASig;
4109 a = float64_squash_input_denormal(a, status);
4110 b = float64_squash_input_denormal(b, status);
4111 aSig = extractFloat64Frac( a );
4112 aExp = extractFloat64Exp( a );
4113 aSign = extractFloat64Sign( a );
4114 bSig = extractFloat64Frac( b );
4115 bExp = extractFloat64Exp( b );
4116 if ( aExp == 0x7FF ) {
4117 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4118 return propagateFloat64NaN(a, b, status);
4120 float_raise(float_flag_invalid, status);
4121 return float64_default_nan(status);
4123 if ( bExp == 0x7FF ) {
4125 return propagateFloat64NaN(a, b, status);
4131 float_raise(float_flag_invalid, status);
4132 return float64_default_nan(status);
4134 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4137 if ( aSig == 0 ) return a;
4138 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4140 expDiff = aExp - bExp;
4141 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4142 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4143 if ( expDiff < 0 ) {
4144 if ( expDiff < -1 ) return a;
4147 q = ( bSig <= aSig );
4148 if ( q ) aSig -= bSig;
4150 while ( 0 < expDiff ) {
4151 q = estimateDiv128To64( aSig, 0, bSig );
4152 q = ( 2 < q ) ? q - 2 : 0;
4153 aSig = - ( ( bSig>>2 ) * q );
4157 if ( 0 < expDiff ) {
4158 q = estimateDiv128To64( aSig, 0, bSig );
4159 q = ( 2 < q ) ? q - 2 : 0;
4162 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4169 alternateASig = aSig;
4172 } while ( 0 <= (int64_t) aSig );
4173 sigMean = aSig + alternateASig;
4174 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4175 aSig = alternateASig;
4177 zSign = ( (int64_t) aSig < 0 );
4178 if ( zSign ) aSig = - aSig;
4179 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4183 /*----------------------------------------------------------------------------
4184 | Returns the binary log of the double-precision floating-point value `a'.
4185 | The operation is performed according to the IEC/IEEE Standard for Binary
4186 | Floating-Point Arithmetic.
4187 *----------------------------------------------------------------------------*/
4188 float64 float64_log2(float64 a, float_status *status)
4192 uint64_t aSig, aSig0, aSig1, zSig, i;
4193 a = float64_squash_input_denormal(a, status);
4195 aSig = extractFloat64Frac( a );
4196 aExp = extractFloat64Exp( a );
4197 aSign = extractFloat64Sign( a );
4200 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4201 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4204 float_raise(float_flag_invalid, status);
4205 return float64_default_nan(status);
4207 if ( aExp == 0x7FF ) {
4209 return propagateFloat64NaN(a, float64_zero, status);
4215 aSig |= LIT64( 0x0010000000000000 );
4217 zSig = (uint64_t)aExp << 52;
4218 for (i = 1LL << 51; i > 0; i >>= 1) {
4219 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4220 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4221 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4229 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4232 /*----------------------------------------------------------------------------
4233 | Returns 1 if the double-precision floating-point value `a' is equal to the
4234 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4235 | if either operand is a NaN. Otherwise, the comparison is performed
4236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4237 *----------------------------------------------------------------------------*/
4239 int float64_eq(float64 a, float64 b, float_status *status)
4242 a = float64_squash_input_denormal(a, status);
4243 b = float64_squash_input_denormal(b, status);
4245 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4246 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4248 float_raise(float_flag_invalid, status);
4251 av = float64_val(a);
4252 bv = float64_val(b);
4253 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4257 /*----------------------------------------------------------------------------
4258 | Returns 1 if the double-precision floating-point value `a' is less than or
4259 | equal to the corresponding value `b', and 0 otherwise. The invalid
4260 | exception is raised if either operand is a NaN. The comparison is performed
4261 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4262 *----------------------------------------------------------------------------*/
4264 int float64_le(float64 a, float64 b, float_status *status)
4268 a = float64_squash_input_denormal(a, status);
4269 b = float64_squash_input_denormal(b, status);
4271 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4272 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4274 float_raise(float_flag_invalid, status);
4277 aSign = extractFloat64Sign( a );
4278 bSign = extractFloat64Sign( b );
4279 av = float64_val(a);
4280 bv = float64_val(b);
4281 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4282 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4286 /*----------------------------------------------------------------------------
4287 | Returns 1 if the double-precision floating-point value `a' is less than
4288 | the corresponding value `b', and 0 otherwise. The invalid exception is
4289 | raised if either operand is a NaN. The comparison is performed according
4290 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4291 *----------------------------------------------------------------------------*/
4293 int float64_lt(float64 a, float64 b, float_status *status)
4298 a = float64_squash_input_denormal(a, status);
4299 b = float64_squash_input_denormal(b, status);
4300 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4301 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4303 float_raise(float_flag_invalid, status);
4306 aSign = extractFloat64Sign( a );
4307 bSign = extractFloat64Sign( b );
4308 av = float64_val(a);
4309 bv = float64_val(b);
4310 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4311 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4315 /*----------------------------------------------------------------------------
4316 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4317 | be compared, and 0 otherwise. The invalid exception is raised if either
4318 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4319 | Standard for Binary Floating-Point Arithmetic.
4320 *----------------------------------------------------------------------------*/
4322 int float64_unordered(float64 a, float64 b, float_status *status)
4324 a = float64_squash_input_denormal(a, status);
4325 b = float64_squash_input_denormal(b, status);
4327 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4328 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4330 float_raise(float_flag_invalid, status);
4336 /*----------------------------------------------------------------------------
4337 | Returns 1 if the double-precision floating-point value `a' is equal to the
4338 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4339 | exception.The comparison is performed according to the IEC/IEEE Standard
4340 | for Binary Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4343 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4346 a = float64_squash_input_denormal(a, status);
4347 b = float64_squash_input_denormal(b, status);
4349 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4350 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4352 if (float64_is_signaling_nan(a, status)
4353 || float64_is_signaling_nan(b, status)) {
4354 float_raise(float_flag_invalid, status);
4358 av = float64_val(a);
4359 bv = float64_val(b);
4360 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4364 /*----------------------------------------------------------------------------
4365 | Returns 1 if the double-precision floating-point value `a' is less than or
4366 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4367 | cause an exception. Otherwise, the comparison is performed according to the
4368 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4369 *----------------------------------------------------------------------------*/
4371 int float64_le_quiet(float64 a, float64 b, float_status *status)
4375 a = float64_squash_input_denormal(a, status);
4376 b = float64_squash_input_denormal(b, status);
4378 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4379 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4381 if (float64_is_signaling_nan(a, status)
4382 || float64_is_signaling_nan(b, status)) {
4383 float_raise(float_flag_invalid, status);
4387 aSign = extractFloat64Sign( a );
4388 bSign = extractFloat64Sign( b );
4389 av = float64_val(a);
4390 bv = float64_val(b);
4391 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4392 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4396 /*----------------------------------------------------------------------------
4397 | Returns 1 if the double-precision floating-point value `a' is less than
4398 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4399 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4400 | Standard for Binary Floating-Point Arithmetic.
4401 *----------------------------------------------------------------------------*/
4403 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4407 a = float64_squash_input_denormal(a, status);
4408 b = float64_squash_input_denormal(b, status);
4410 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4411 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4413 if (float64_is_signaling_nan(a, status)
4414 || float64_is_signaling_nan(b, status)) {
4415 float_raise(float_flag_invalid, status);
4419 aSign = extractFloat64Sign( a );
4420 bSign = extractFloat64Sign( b );
4421 av = float64_val(a);
4422 bv = float64_val(b);
4423 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4424 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4428 /*----------------------------------------------------------------------------
4429 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4430 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4431 | comparison is performed according to the IEC/IEEE Standard for Binary
4432 | Floating-Point Arithmetic.
4433 *----------------------------------------------------------------------------*/
4435 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4437 a = float64_squash_input_denormal(a, status);
4438 b = float64_squash_input_denormal(b, status);
4440 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4441 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4443 if (float64_is_signaling_nan(a, status)
4444 || float64_is_signaling_nan(b, status)) {
4445 float_raise(float_flag_invalid, status);
4452 /*----------------------------------------------------------------------------
4453 | Returns the result of converting the extended double-precision floating-
4454 | point value `a' to the 32-bit two's complement integer format. The
4455 | conversion is performed according to the IEC/IEEE Standard for Binary
4456 | Floating-Point Arithmetic---which means in particular that the conversion
4457 | is rounded according to the current rounding mode. If `a' is a NaN, the
4458 | largest positive integer is returned. Otherwise, if the conversion
4459 | overflows, the largest integer with the same sign as `a' is returned.
4460 *----------------------------------------------------------------------------*/
4462 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4465 int32_t aExp, shiftCount;
4468 if (floatx80_invalid_encoding(a)) {
4469 float_raise(float_flag_invalid, status);
4472 aSig = extractFloatx80Frac( a );
4473 aExp = extractFloatx80Exp( a );
4474 aSign = extractFloatx80Sign( a );
4475 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4476 shiftCount = 0x4037 - aExp;
4477 if ( shiftCount <= 0 ) shiftCount = 1;
4478 shift64RightJamming( aSig, shiftCount, &aSig );
4479 return roundAndPackInt32(aSign, aSig, status);
4483 /*----------------------------------------------------------------------------
4484 | Returns the result of converting the extended double-precision floating-
4485 | point value `a' to the 32-bit two's complement integer format. The
4486 | conversion is performed according to the IEC/IEEE Standard for Binary
4487 | Floating-Point Arithmetic, except that the conversion is always rounded
4488 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4489 | Otherwise, if the conversion overflows, the largest integer with the same
4490 | sign as `a' is returned.
4491 *----------------------------------------------------------------------------*/
4493 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4496 int32_t aExp, shiftCount;
4497 uint64_t aSig, savedASig;
4500 if (floatx80_invalid_encoding(a)) {
4501 float_raise(float_flag_invalid, status);
4504 aSig = extractFloatx80Frac( a );
4505 aExp = extractFloatx80Exp( a );
4506 aSign = extractFloatx80Sign( a );
4507 if ( 0x401E < aExp ) {
4508 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4511 else if ( aExp < 0x3FFF ) {
4513 status->float_exception_flags |= float_flag_inexact;
4517 shiftCount = 0x403E - aExp;
4519 aSig >>= shiftCount;
4521 if ( aSign ) z = - z;
4522 if ( ( z < 0 ) ^ aSign ) {
4524 float_raise(float_flag_invalid, status);
4525 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4527 if ( ( aSig<<shiftCount ) != savedASig ) {
4528 status->float_exception_flags |= float_flag_inexact;
4534 /*----------------------------------------------------------------------------
4535 | Returns the result of converting the extended double-precision floating-
4536 | point value `a' to the 64-bit two's complement integer format. The
4537 | conversion is performed according to the IEC/IEEE Standard for Binary
4538 | Floating-Point Arithmetic---which means in particular that the conversion
4539 | is rounded according to the current rounding mode. If `a' is a NaN,
4540 | the largest positive integer is returned. Otherwise, if the conversion
4541 | overflows, the largest integer with the same sign as `a' is returned.
4542 *----------------------------------------------------------------------------*/
4544 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4547 int32_t aExp, shiftCount;
4548 uint64_t aSig, aSigExtra;
4550 if (floatx80_invalid_encoding(a)) {
4551 float_raise(float_flag_invalid, status);
4554 aSig = extractFloatx80Frac( a );
4555 aExp = extractFloatx80Exp( a );
4556 aSign = extractFloatx80Sign( a );
4557 shiftCount = 0x403E - aExp;
4558 if ( shiftCount <= 0 ) {
4560 float_raise(float_flag_invalid, status);
4561 if (!aSign || floatx80_is_any_nan(a)) {
4562 return LIT64( 0x7FFFFFFFFFFFFFFF );
4564 return (int64_t) LIT64( 0x8000000000000000 );
4569 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4571 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4575 /*----------------------------------------------------------------------------
4576 | Returns the result of converting the extended double-precision floating-
4577 | point value `a' to the 64-bit two's complement integer format. The
4578 | conversion is performed according to the IEC/IEEE Standard for Binary
4579 | Floating-Point Arithmetic, except that the conversion is always rounded
4580 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4581 | Otherwise, if the conversion overflows, the largest integer with the same
4582 | sign as `a' is returned.
4583 *----------------------------------------------------------------------------*/
4585 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4588 int32_t aExp, shiftCount;
4592 if (floatx80_invalid_encoding(a)) {
4593 float_raise(float_flag_invalid, status);
4596 aSig = extractFloatx80Frac( a );
4597 aExp = extractFloatx80Exp( a );
4598 aSign = extractFloatx80Sign( a );
4599 shiftCount = aExp - 0x403E;
4600 if ( 0 <= shiftCount ) {
4601 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4602 if ( ( a.high != 0xC03E ) || aSig ) {
4603 float_raise(float_flag_invalid, status);
4604 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4605 return LIT64( 0x7FFFFFFFFFFFFFFF );
4608 return (int64_t) LIT64( 0x8000000000000000 );
4610 else if ( aExp < 0x3FFF ) {
4612 status->float_exception_flags |= float_flag_inexact;
4616 z = aSig>>( - shiftCount );
4617 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4618 status->float_exception_flags |= float_flag_inexact;
4620 if ( aSign ) z = - z;
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of converting the extended double-precision floating-
4627 | point value `a' to the single-precision floating-point format. The
4628 | conversion is performed according to the IEC/IEEE Standard for Binary
4629 | Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4632 float32 floatx80_to_float32(floatx80 a, float_status *status)
4638 if (floatx80_invalid_encoding(a)) {
4639 float_raise(float_flag_invalid, status);
4640 return float32_default_nan(status);
4642 aSig = extractFloatx80Frac( a );
4643 aExp = extractFloatx80Exp( a );
4644 aSign = extractFloatx80Sign( a );
4645 if ( aExp == 0x7FFF ) {
4646 if ( (uint64_t) ( aSig<<1 ) ) {
4647 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4649 return packFloat32( aSign, 0xFF, 0 );
4651 shift64RightJamming( aSig, 33, &aSig );
4652 if ( aExp || aSig ) aExp -= 0x3F81;
4653 return roundAndPackFloat32(aSign, aExp, aSig, status);
4657 /*----------------------------------------------------------------------------
4658 | Returns the result of converting the extended double-precision floating-
4659 | point value `a' to the double-precision floating-point format. The
4660 | conversion is performed according to the IEC/IEEE Standard for Binary
4661 | Floating-Point Arithmetic.
4662 *----------------------------------------------------------------------------*/
4664 float64 floatx80_to_float64(floatx80 a, float_status *status)
4668 uint64_t aSig, zSig;
4670 if (floatx80_invalid_encoding(a)) {
4671 float_raise(float_flag_invalid, status);
4672 return float64_default_nan(status);
4674 aSig = extractFloatx80Frac( a );
4675 aExp = extractFloatx80Exp( a );
4676 aSign = extractFloatx80Sign( a );
4677 if ( aExp == 0x7FFF ) {
4678 if ( (uint64_t) ( aSig<<1 ) ) {
4679 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4681 return packFloat64( aSign, 0x7FF, 0 );
4683 shift64RightJamming( aSig, 1, &zSig );
4684 if ( aExp || aSig ) aExp -= 0x3C01;
4685 return roundAndPackFloat64(aSign, aExp, zSig, status);
4689 /*----------------------------------------------------------------------------
4690 | Returns the result of converting the extended double-precision floating-
4691 | point value `a' to the quadruple-precision floating-point format. The
4692 | conversion is performed according to the IEC/IEEE Standard for Binary
4693 | Floating-Point Arithmetic.
4694 *----------------------------------------------------------------------------*/
4696 float128 floatx80_to_float128(floatx80 a, float_status *status)
4700 uint64_t aSig, zSig0, zSig1;
4702 if (floatx80_invalid_encoding(a)) {
4703 float_raise(float_flag_invalid, status);
4704 return float128_default_nan(status);
4706 aSig = extractFloatx80Frac( a );
4707 aExp = extractFloatx80Exp( a );
4708 aSign = extractFloatx80Sign( a );
4709 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4710 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4712 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4713 return packFloat128( aSign, aExp, zSig0, zSig1 );
4717 /*----------------------------------------------------------------------------
4718 | Rounds the extended double-precision floating-point value `a'
4719 | to the precision provided by floatx80_rounding_precision and returns the
4720 | result as an extended double-precision floating-point value.
4721 | The operation is performed according to the IEC/IEEE Standard for Binary
4722 | Floating-Point Arithmetic.
4723 *----------------------------------------------------------------------------*/
4725 floatx80 floatx80_round(floatx80 a, float_status *status)
4727 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4728 extractFloatx80Sign(a),
4729 extractFloatx80Exp(a),
4730 extractFloatx80Frac(a), 0, status);
4733 /*----------------------------------------------------------------------------
4734 | Rounds the extended double-precision floating-point value `a' to an integer,
4735 | and returns the result as an extended quadruple-precision floating-point
4736 | value. The operation is performed according to the IEC/IEEE Standard for
4737 | Binary Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4740 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4744 uint64_t lastBitMask, roundBitsMask;
4747 if (floatx80_invalid_encoding(a)) {
4748 float_raise(float_flag_invalid, status);
4749 return floatx80_default_nan(status);
4751 aExp = extractFloatx80Exp( a );
4752 if ( 0x403E <= aExp ) {
4753 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4754 return propagateFloatx80NaN(a, a, status);
4758 if ( aExp < 0x3FFF ) {
4760 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4763 status->float_exception_flags |= float_flag_inexact;
4764 aSign = extractFloatx80Sign( a );
4765 switch (status->float_rounding_mode) {
4766 case float_round_nearest_even:
4767 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4770 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4773 case float_round_ties_away:
4774 if (aExp == 0x3FFE) {
4775 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4778 case float_round_down:
4781 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4782 : packFloatx80( 0, 0, 0 );
4783 case float_round_up:
4785 aSign ? packFloatx80( 1, 0, 0 )
4786 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4788 return packFloatx80( aSign, 0, 0 );
4791 lastBitMask <<= 0x403E - aExp;
4792 roundBitsMask = lastBitMask - 1;
4794 switch (status->float_rounding_mode) {
4795 case float_round_nearest_even:
4796 z.low += lastBitMask>>1;
4797 if ((z.low & roundBitsMask) == 0) {
4798 z.low &= ~lastBitMask;
4801 case float_round_ties_away:
4802 z.low += lastBitMask >> 1;
4804 case float_round_to_zero:
4806 case float_round_up:
4807 if (!extractFloatx80Sign(z)) {
4808 z.low += roundBitsMask;
4811 case float_round_down:
4812 if (extractFloatx80Sign(z)) {
4813 z.low += roundBitsMask;
4819 z.low &= ~ roundBitsMask;
4822 z.low = LIT64( 0x8000000000000000 );
4824 if (z.low != a.low) {
4825 status->float_exception_flags |= float_flag_inexact;
4831 /*----------------------------------------------------------------------------
4832 | Returns the result of adding the absolute values of the extended double-
4833 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4834 | negated before being returned. `zSign' is ignored if the result is a NaN.
4835 | The addition is performed according to the IEC/IEEE Standard for Binary
4836 | Floating-Point Arithmetic.
4837 *----------------------------------------------------------------------------*/
4839 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4840 float_status *status)
4842 int32_t aExp, bExp, zExp;
4843 uint64_t aSig, bSig, zSig0, zSig1;
4846 aSig = extractFloatx80Frac( a );
4847 aExp = extractFloatx80Exp( a );
4848 bSig = extractFloatx80Frac( b );
4849 bExp = extractFloatx80Exp( b );
4850 expDiff = aExp - bExp;
4851 if ( 0 < expDiff ) {
4852 if ( aExp == 0x7FFF ) {
4853 if ((uint64_t)(aSig << 1)) {
4854 return propagateFloatx80NaN(a, b, status);
4858 if ( bExp == 0 ) --expDiff;
4859 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4862 else if ( expDiff < 0 ) {
4863 if ( bExp == 0x7FFF ) {
4864 if ((uint64_t)(bSig << 1)) {
4865 return propagateFloatx80NaN(a, b, status);
4867 return packFloatx80(zSign,
4868 floatx80_infinity_high,
4869 floatx80_infinity_low);
4871 if ( aExp == 0 ) ++expDiff;
4872 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4876 if ( aExp == 0x7FFF ) {
4877 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4878 return propagateFloatx80NaN(a, b, status);
4883 zSig0 = aSig + bSig;
4885 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4891 zSig0 = aSig + bSig;
4892 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4894 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4895 zSig0 |= LIT64( 0x8000000000000000 );
4898 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4899 zSign, zExp, zSig0, zSig1, status);
4902 /*----------------------------------------------------------------------------
4903 | Returns the result of subtracting the absolute values of the extended
4904 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4905 | difference is negated before being returned. `zSign' is ignored if the
4906 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4907 | Standard for Binary Floating-Point Arithmetic.
4908 *----------------------------------------------------------------------------*/
4910 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4911 float_status *status)
4913 int32_t aExp, bExp, zExp;
4914 uint64_t aSig, bSig, zSig0, zSig1;
4917 aSig = extractFloatx80Frac( a );
4918 aExp = extractFloatx80Exp( a );
4919 bSig = extractFloatx80Frac( b );
4920 bExp = extractFloatx80Exp( b );
4921 expDiff = aExp - bExp;
4922 if ( 0 < expDiff ) goto aExpBigger;
4923 if ( expDiff < 0 ) goto bExpBigger;
4924 if ( aExp == 0x7FFF ) {
4925 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4926 return propagateFloatx80NaN(a, b, status);
4928 float_raise(float_flag_invalid, status);
4929 return floatx80_default_nan(status);
4936 if ( bSig < aSig ) goto aBigger;
4937 if ( aSig < bSig ) goto bBigger;
4938 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4940 if ( bExp == 0x7FFF ) {
4941 if ((uint64_t)(bSig << 1)) {
4942 return propagateFloatx80NaN(a, b, status);
4944 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4945 floatx80_infinity_low);
4947 if ( aExp == 0 ) ++expDiff;
4948 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4950 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4953 goto normalizeRoundAndPack;
4955 if ( aExp == 0x7FFF ) {
4956 if ((uint64_t)(aSig << 1)) {
4957 return propagateFloatx80NaN(a, b, status);
4961 if ( bExp == 0 ) --expDiff;
4962 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4964 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4966 normalizeRoundAndPack:
4967 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4968 zSign, zExp, zSig0, zSig1, status);
4971 /*----------------------------------------------------------------------------
4972 | Returns the result of adding the extended double-precision floating-point
4973 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4974 | Standard for Binary Floating-Point Arithmetic.
4975 *----------------------------------------------------------------------------*/
4977 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4981 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4982 float_raise(float_flag_invalid, status);
4983 return floatx80_default_nan(status);
4985 aSign = extractFloatx80Sign( a );
4986 bSign = extractFloatx80Sign( b );
4987 if ( aSign == bSign ) {
4988 return addFloatx80Sigs(a, b, aSign, status);
4991 return subFloatx80Sigs(a, b, aSign, status);
4996 /*----------------------------------------------------------------------------
4997 | Returns the result of subtracting the extended double-precision floating-
4998 | point values `a' and `b'. The operation is performed according to the
4999 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5000 *----------------------------------------------------------------------------*/
5002 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5006 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5007 float_raise(float_flag_invalid, status);
5008 return floatx80_default_nan(status);
5010 aSign = extractFloatx80Sign( a );
5011 bSign = extractFloatx80Sign( b );
5012 if ( aSign == bSign ) {
5013 return subFloatx80Sigs(a, b, aSign, status);
5016 return addFloatx80Sigs(a, b, aSign, status);
5021 /*----------------------------------------------------------------------------
5022 | Returns the result of multiplying the extended double-precision floating-
5023 | point values `a' and `b'. The operation is performed according to the
5024 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5025 *----------------------------------------------------------------------------*/
5027 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5029 flag aSign, bSign, zSign;
5030 int32_t aExp, bExp, zExp;
5031 uint64_t aSig, bSig, zSig0, zSig1;
5033 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5034 float_raise(float_flag_invalid, status);
5035 return floatx80_default_nan(status);
5037 aSig = extractFloatx80Frac( a );
5038 aExp = extractFloatx80Exp( a );
5039 aSign = extractFloatx80Sign( a );
5040 bSig = extractFloatx80Frac( b );
5041 bExp = extractFloatx80Exp( b );
5042 bSign = extractFloatx80Sign( b );
5043 zSign = aSign ^ bSign;
5044 if ( aExp == 0x7FFF ) {
5045 if ( (uint64_t) ( aSig<<1 )
5046 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5047 return propagateFloatx80NaN(a, b, status);
5049 if ( ( bExp | bSig ) == 0 ) goto invalid;
5050 return packFloatx80(zSign, floatx80_infinity_high,
5051 floatx80_infinity_low);
5053 if ( bExp == 0x7FFF ) {
5054 if ((uint64_t)(bSig << 1)) {
5055 return propagateFloatx80NaN(a, b, status);
5057 if ( ( aExp | aSig ) == 0 ) {
5059 float_raise(float_flag_invalid, status);
5060 return floatx80_default_nan(status);
5062 return packFloatx80(zSign, floatx80_infinity_high,
5063 floatx80_infinity_low);
5066 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5067 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5070 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5071 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5073 zExp = aExp + bExp - 0x3FFE;
5074 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5075 if ( 0 < (int64_t) zSig0 ) {
5076 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5079 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5080 zSign, zExp, zSig0, zSig1, status);
5083 /*----------------------------------------------------------------------------
5084 | Returns the result of dividing the extended double-precision floating-point
5085 | value `a' by the corresponding value `b'. The operation is performed
5086 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5087 *----------------------------------------------------------------------------*/
5089 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5091 flag aSign, bSign, zSign;
5092 int32_t aExp, bExp, zExp;
5093 uint64_t aSig, bSig, zSig0, zSig1;
5094 uint64_t rem0, rem1, rem2, term0, term1, term2;
5096 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5097 float_raise(float_flag_invalid, status);
5098 return floatx80_default_nan(status);
5100 aSig = extractFloatx80Frac( a );
5101 aExp = extractFloatx80Exp( a );
5102 aSign = extractFloatx80Sign( a );
5103 bSig = extractFloatx80Frac( b );
5104 bExp = extractFloatx80Exp( b );
5105 bSign = extractFloatx80Sign( b );
5106 zSign = aSign ^ bSign;
5107 if ( aExp == 0x7FFF ) {
5108 if ((uint64_t)(aSig << 1)) {
5109 return propagateFloatx80NaN(a, b, status);
5111 if ( bExp == 0x7FFF ) {
5112 if ((uint64_t)(bSig << 1)) {
5113 return propagateFloatx80NaN(a, b, status);
5117 return packFloatx80(zSign, floatx80_infinity_high,
5118 floatx80_infinity_low);
5120 if ( bExp == 0x7FFF ) {
5121 if ((uint64_t)(bSig << 1)) {
5122 return propagateFloatx80NaN(a, b, status);
5124 return packFloatx80( zSign, 0, 0 );
5128 if ( ( aExp | aSig ) == 0 ) {
5130 float_raise(float_flag_invalid, status);
5131 return floatx80_default_nan(status);
5133 float_raise(float_flag_divbyzero, status);
5134 return packFloatx80(zSign, floatx80_infinity_high,
5135 floatx80_infinity_low);
5137 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5140 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5141 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5143 zExp = aExp - bExp + 0x3FFE;
5145 if ( bSig <= aSig ) {
5146 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5149 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5150 mul64To128( bSig, zSig0, &term0, &term1 );
5151 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5152 while ( (int64_t) rem0 < 0 ) {
5154 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5156 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5157 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5158 mul64To128( bSig, zSig1, &term1, &term2 );
5159 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5160 while ( (int64_t) rem1 < 0 ) {
5162 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5164 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5166 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5167 zSign, zExp, zSig0, zSig1, status);
5170 /*----------------------------------------------------------------------------
5171 | Returns the remainder of the extended double-precision floating-point value
5172 | `a' with respect to the corresponding value `b'. The operation is performed
5173 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5174 *----------------------------------------------------------------------------*/
5176 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5179 int32_t aExp, bExp, expDiff;
5180 uint64_t aSig0, aSig1, bSig;
5181 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5183 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5184 float_raise(float_flag_invalid, status);
5185 return floatx80_default_nan(status);
5187 aSig0 = extractFloatx80Frac( a );
5188 aExp = extractFloatx80Exp( a );
5189 aSign = extractFloatx80Sign( a );
5190 bSig = extractFloatx80Frac( b );
5191 bExp = extractFloatx80Exp( b );
5192 if ( aExp == 0x7FFF ) {
5193 if ( (uint64_t) ( aSig0<<1 )
5194 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5195 return propagateFloatx80NaN(a, b, status);
5199 if ( bExp == 0x7FFF ) {
5200 if ((uint64_t)(bSig << 1)) {
5201 return propagateFloatx80NaN(a, b, status);
5208 float_raise(float_flag_invalid, status);
5209 return floatx80_default_nan(status);
5211 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5214 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5215 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5217 bSig |= LIT64( 0x8000000000000000 );
5219 expDiff = aExp - bExp;
5221 if ( expDiff < 0 ) {
5222 if ( expDiff < -1 ) return a;
5223 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5226 q = ( bSig <= aSig0 );
5227 if ( q ) aSig0 -= bSig;
5229 while ( 0 < expDiff ) {
5230 q = estimateDiv128To64( aSig0, aSig1, bSig );
5231 q = ( 2 < q ) ? q - 2 : 0;
5232 mul64To128( bSig, q, &term0, &term1 );
5233 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5234 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5238 if ( 0 < expDiff ) {
5239 q = estimateDiv128To64( aSig0, aSig1, bSig );
5240 q = ( 2 < q ) ? q - 2 : 0;
5242 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5243 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5244 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5245 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5247 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5254 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5255 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5256 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5259 aSig0 = alternateASig0;
5260 aSig1 = alternateASig1;
5264 normalizeRoundAndPackFloatx80(
5265 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5269 /*----------------------------------------------------------------------------
5270 | Returns the square root of the extended double-precision floating-point
5271 | value `a'. The operation is performed according to the IEC/IEEE Standard
5272 | for Binary Floating-Point Arithmetic.
5273 *----------------------------------------------------------------------------*/
5275 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5279 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5280 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5282 if (floatx80_invalid_encoding(a)) {
5283 float_raise(float_flag_invalid, status);
5284 return floatx80_default_nan(status);
5286 aSig0 = extractFloatx80Frac( a );
5287 aExp = extractFloatx80Exp( a );
5288 aSign = extractFloatx80Sign( a );
5289 if ( aExp == 0x7FFF ) {
5290 if ((uint64_t)(aSig0 << 1)) {
5291 return propagateFloatx80NaN(a, a, status);
5293 if ( ! aSign ) return a;
5297 if ( ( aExp | aSig0 ) == 0 ) return a;
5299 float_raise(float_flag_invalid, status);
5300 return floatx80_default_nan(status);
5303 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5304 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5306 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5307 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5308 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5309 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5310 doubleZSig0 = zSig0<<1;
5311 mul64To128( zSig0, zSig0, &term0, &term1 );
5312 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5313 while ( (int64_t) rem0 < 0 ) {
5316 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5318 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5319 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5320 if ( zSig1 == 0 ) zSig1 = 1;
5321 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5322 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5323 mul64To128( zSig1, zSig1, &term2, &term3 );
5324 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5325 while ( (int64_t) rem1 < 0 ) {
5327 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5329 term2 |= doubleZSig0;
5330 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5332 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5334 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5335 zSig0 |= doubleZSig0;
5336 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5337 0, zExp, zSig0, zSig1, status);
5340 /*----------------------------------------------------------------------------
5341 | Returns 1 if the extended double-precision floating-point value `a' is equal
5342 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5343 | raised if either operand is a NaN. Otherwise, the comparison is performed
5344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5345 *----------------------------------------------------------------------------*/
5347 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5350 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5351 || (extractFloatx80Exp(a) == 0x7FFF
5352 && (uint64_t) (extractFloatx80Frac(a) << 1))
5353 || (extractFloatx80Exp(b) == 0x7FFF
5354 && (uint64_t) (extractFloatx80Frac(b) << 1))
5356 float_raise(float_flag_invalid, status);
5361 && ( ( a.high == b.high )
5363 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5368 /*----------------------------------------------------------------------------
5369 | Returns 1 if the extended double-precision floating-point value `a' is
5370 | less than or equal to the corresponding value `b', and 0 otherwise. The
5371 | invalid exception is raised if either operand is a NaN. The comparison is
5372 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5374 *----------------------------------------------------------------------------*/
5376 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5380 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5381 || (extractFloatx80Exp(a) == 0x7FFF
5382 && (uint64_t) (extractFloatx80Frac(a) << 1))
5383 || (extractFloatx80Exp(b) == 0x7FFF
5384 && (uint64_t) (extractFloatx80Frac(b) << 1))
5386 float_raise(float_flag_invalid, status);
5389 aSign = extractFloatx80Sign( a );
5390 bSign = extractFloatx80Sign( b );
5391 if ( aSign != bSign ) {
5394 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5398 aSign ? le128( b.high, b.low, a.high, a.low )
5399 : le128( a.high, a.low, b.high, b.low );
5403 /*----------------------------------------------------------------------------
5404 | Returns 1 if the extended double-precision floating-point value `a' is
5405 | less than the corresponding value `b', and 0 otherwise. The invalid
5406 | exception is raised if either operand is a NaN. The comparison is performed
5407 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5408 *----------------------------------------------------------------------------*/
5410 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5414 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5415 || (extractFloatx80Exp(a) == 0x7FFF
5416 && (uint64_t) (extractFloatx80Frac(a) << 1))
5417 || (extractFloatx80Exp(b) == 0x7FFF
5418 && (uint64_t) (extractFloatx80Frac(b) << 1))
5420 float_raise(float_flag_invalid, status);
5423 aSign = extractFloatx80Sign( a );
5424 bSign = extractFloatx80Sign( b );
5425 if ( aSign != bSign ) {
5428 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5432 aSign ? lt128( b.high, b.low, a.high, a.low )
5433 : lt128( a.high, a.low, b.high, b.low );
5437 /*----------------------------------------------------------------------------
5438 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5439 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5440 | either operand is a NaN. The comparison is performed according to the
5441 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5442 *----------------------------------------------------------------------------*/
5443 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5445 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5446 || (extractFloatx80Exp(a) == 0x7FFF
5447 && (uint64_t) (extractFloatx80Frac(a) << 1))
5448 || (extractFloatx80Exp(b) == 0x7FFF
5449 && (uint64_t) (extractFloatx80Frac(b) << 1))
5451 float_raise(float_flag_invalid, status);
5457 /*----------------------------------------------------------------------------
5458 | Returns 1 if the extended double-precision floating-point value `a' is
5459 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5460 | cause an exception. The comparison is performed according to the IEC/IEEE
5461 | Standard for Binary Floating-Point Arithmetic.
5462 *----------------------------------------------------------------------------*/
5464 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5467 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5468 float_raise(float_flag_invalid, status);
5471 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5472 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5473 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5474 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5476 if (floatx80_is_signaling_nan(a, status)
5477 || floatx80_is_signaling_nan(b, status)) {
5478 float_raise(float_flag_invalid, status);
5484 && ( ( a.high == b.high )
5486 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5491 /*----------------------------------------------------------------------------
5492 | Returns 1 if the extended double-precision floating-point value `a' is less
5493 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5494 | do not cause an exception. Otherwise, the comparison is performed according
5495 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5496 *----------------------------------------------------------------------------*/
5498 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5502 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5503 float_raise(float_flag_invalid, status);
5506 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5507 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5508 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5509 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5511 if (floatx80_is_signaling_nan(a, status)
5512 || floatx80_is_signaling_nan(b, status)) {
5513 float_raise(float_flag_invalid, status);
5517 aSign = extractFloatx80Sign( a );
5518 bSign = extractFloatx80Sign( b );
5519 if ( aSign != bSign ) {
5522 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5526 aSign ? le128( b.high, b.low, a.high, a.low )
5527 : le128( a.high, a.low, b.high, b.low );
5531 /*----------------------------------------------------------------------------
5532 | Returns 1 if the extended double-precision floating-point value `a' is less
5533 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5534 | an exception. Otherwise, the comparison is performed according to the
5535 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5536 *----------------------------------------------------------------------------*/
5538 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5542 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5543 float_raise(float_flag_invalid, status);
5546 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5547 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5548 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5549 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5551 if (floatx80_is_signaling_nan(a, status)
5552 || floatx80_is_signaling_nan(b, status)) {
5553 float_raise(float_flag_invalid, status);
5557 aSign = extractFloatx80Sign( a );
5558 bSign = extractFloatx80Sign( b );
5559 if ( aSign != bSign ) {
5562 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5566 aSign ? lt128( b.high, b.low, a.high, a.low )
5567 : lt128( a.high, a.low, b.high, b.low );
5571 /*----------------------------------------------------------------------------
5572 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5573 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5574 | The comparison is performed according to the IEC/IEEE Standard for Binary
5575 | Floating-Point Arithmetic.
5576 *----------------------------------------------------------------------------*/
5577 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5579 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5580 float_raise(float_flag_invalid, status);
5583 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5584 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5585 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5586 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5588 if (floatx80_is_signaling_nan(a, status)
5589 || floatx80_is_signaling_nan(b, status)) {
5590 float_raise(float_flag_invalid, status);
5597 /*----------------------------------------------------------------------------
5598 | Returns the result of converting the quadruple-precision floating-point
5599 | value `a' to the 32-bit two's complement integer format. The conversion
5600 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5601 | Arithmetic---which means in particular that the conversion is rounded
5602 | according to the current rounding mode. If `a' is a NaN, the largest
5603 | positive integer is returned. Otherwise, if the conversion overflows, the
5604 | largest integer with the same sign as `a' is returned.
5605 *----------------------------------------------------------------------------*/
5607 int32_t float128_to_int32(float128 a, float_status *status)
5610 int32_t aExp, shiftCount;
5611 uint64_t aSig0, aSig1;
5613 aSig1 = extractFloat128Frac1( a );
5614 aSig0 = extractFloat128Frac0( a );
5615 aExp = extractFloat128Exp( a );
5616 aSign = extractFloat128Sign( a );
5617 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5618 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5619 aSig0 |= ( aSig1 != 0 );
5620 shiftCount = 0x4028 - aExp;
5621 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5622 return roundAndPackInt32(aSign, aSig0, status);
5626 /*----------------------------------------------------------------------------
5627 | Returns the result of converting the quadruple-precision floating-point
5628 | value `a' to the 32-bit two's complement integer format. The conversion
5629 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5630 | Arithmetic, except that the conversion is always rounded toward zero. If
5631 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5632 | conversion overflows, the largest integer with the same sign as `a' is
5634 *----------------------------------------------------------------------------*/
5636 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5639 int32_t aExp, shiftCount;
5640 uint64_t aSig0, aSig1, savedASig;
5643 aSig1 = extractFloat128Frac1( a );
5644 aSig0 = extractFloat128Frac0( a );
5645 aExp = extractFloat128Exp( a );
5646 aSign = extractFloat128Sign( a );
5647 aSig0 |= ( aSig1 != 0 );
5648 if ( 0x401E < aExp ) {
5649 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5652 else if ( aExp < 0x3FFF ) {
5653 if (aExp || aSig0) {
5654 status->float_exception_flags |= float_flag_inexact;
5658 aSig0 |= LIT64( 0x0001000000000000 );
5659 shiftCount = 0x402F - aExp;
5661 aSig0 >>= shiftCount;
5663 if ( aSign ) z = - z;
5664 if ( ( z < 0 ) ^ aSign ) {
5666 float_raise(float_flag_invalid, status);
5667 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5669 if ( ( aSig0<<shiftCount ) != savedASig ) {
5670 status->float_exception_flags |= float_flag_inexact;
5676 /*----------------------------------------------------------------------------
5677 | Returns the result of converting the quadruple-precision floating-point
5678 | value `a' to the 64-bit two's complement integer format. The conversion
5679 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5680 | Arithmetic---which means in particular that the conversion is rounded
5681 | according to the current rounding mode. If `a' is a NaN, the largest
5682 | positive integer is returned. Otherwise, if the conversion overflows, the
5683 | largest integer with the same sign as `a' is returned.
5684 *----------------------------------------------------------------------------*/
5686 int64_t float128_to_int64(float128 a, float_status *status)
5689 int32_t aExp, shiftCount;
5690 uint64_t aSig0, aSig1;
5692 aSig1 = extractFloat128Frac1( a );
5693 aSig0 = extractFloat128Frac0( a );
5694 aExp = extractFloat128Exp( a );
5695 aSign = extractFloat128Sign( a );
5696 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5697 shiftCount = 0x402F - aExp;
5698 if ( shiftCount <= 0 ) {
5699 if ( 0x403E < aExp ) {
5700 float_raise(float_flag_invalid, status);
5702 || ( ( aExp == 0x7FFF )
5703 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5706 return LIT64( 0x7FFFFFFFFFFFFFFF );
5708 return (int64_t) LIT64( 0x8000000000000000 );
5710 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5713 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5715 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5719 /*----------------------------------------------------------------------------
5720 | Returns the result of converting the quadruple-precision floating-point
5721 | value `a' to the 64-bit two's complement integer format. The conversion
5722 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5723 | Arithmetic, except that the conversion is always rounded toward zero.
5724 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5725 | the conversion overflows, the largest integer with the same sign as `a' is
5727 *----------------------------------------------------------------------------*/
5729 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5732 int32_t aExp, shiftCount;
5733 uint64_t aSig0, aSig1;
5736 aSig1 = extractFloat128Frac1( a );
5737 aSig0 = extractFloat128Frac0( a );
5738 aExp = extractFloat128Exp( a );
5739 aSign = extractFloat128Sign( a );
5740 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5741 shiftCount = aExp - 0x402F;
5742 if ( 0 < shiftCount ) {
5743 if ( 0x403E <= aExp ) {
5744 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5745 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5746 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5748 status->float_exception_flags |= float_flag_inexact;
5752 float_raise(float_flag_invalid, status);
5753 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5754 return LIT64( 0x7FFFFFFFFFFFFFFF );
5757 return (int64_t) LIT64( 0x8000000000000000 );
5759 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5760 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5761 status->float_exception_flags |= float_flag_inexact;
5765 if ( aExp < 0x3FFF ) {
5766 if ( aExp | aSig0 | aSig1 ) {
5767 status->float_exception_flags |= float_flag_inexact;
5771 z = aSig0>>( - shiftCount );
5773 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5774 status->float_exception_flags |= float_flag_inexact;
5777 if ( aSign ) z = - z;
5782 /*----------------------------------------------------------------------------
5783 | Returns the result of converting the quadruple-precision floating-point value
5784 | `a' to the 64-bit unsigned integer format. The conversion is
5785 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5786 | Arithmetic---which means in particular that the conversion is rounded
5787 | according to the current rounding mode. If `a' is a NaN, the largest
5788 | positive integer is returned. If the conversion overflows, the
5789 | largest unsigned integer is returned. If 'a' is negative, the value is
5790 | rounded and zero is returned; negative values that do not round to zero
5791 | will raise the inexact exception.
5792 *----------------------------------------------------------------------------*/
5794 uint64_t float128_to_uint64(float128 a, float_status *status)
5799 uint64_t aSig0, aSig1;
5801 aSig0 = extractFloat128Frac0(a);
5802 aSig1 = extractFloat128Frac1(a);
5803 aExp = extractFloat128Exp(a);
5804 aSign = extractFloat128Sign(a);
5805 if (aSign && (aExp > 0x3FFE)) {
5806 float_raise(float_flag_invalid, status);
5807 if (float128_is_any_nan(a)) {
5808 return LIT64(0xFFFFFFFFFFFFFFFF);
5814 aSig0 |= LIT64(0x0001000000000000);
5816 shiftCount = 0x402F - aExp;
5817 if (shiftCount <= 0) {
5818 if (0x403E < aExp) {
5819 float_raise(float_flag_invalid, status);
5820 return LIT64(0xFFFFFFFFFFFFFFFF);
5822 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5824 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5826 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5829 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5832 signed char current_rounding_mode = status->float_rounding_mode;
5834 set_float_rounding_mode(float_round_to_zero, status);
5835 v = float128_to_uint64(a, status);
5836 set_float_rounding_mode(current_rounding_mode, status);
5841 /*----------------------------------------------------------------------------
5842 | Returns the result of converting the quadruple-precision floating-point
5843 | value `a' to the 32-bit unsigned integer format. The conversion
5844 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5845 | Arithmetic except that the conversion is always rounded toward zero.
5846 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5847 | if the conversion overflows, the largest unsigned integer is returned.
5848 | If 'a' is negative, the value is rounded and zero is returned; negative
5849 | values that do not round to zero will raise the inexact exception.
5850 *----------------------------------------------------------------------------*/
5852 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5856 int old_exc_flags = get_float_exception_flags(status);
5858 v = float128_to_uint64_round_to_zero(a, status);
5859 if (v > 0xffffffff) {
5864 set_float_exception_flags(old_exc_flags, status);
5865 float_raise(float_flag_invalid, status);
5869 /*----------------------------------------------------------------------------
5870 | Returns the result of converting the quadruple-precision floating-point
5871 | value `a' to the single-precision floating-point format. The conversion
5872 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5874 *----------------------------------------------------------------------------*/
5876 float32 float128_to_float32(float128 a, float_status *status)
5880 uint64_t aSig0, aSig1;
5883 aSig1 = extractFloat128Frac1( a );
5884 aSig0 = extractFloat128Frac0( a );
5885 aExp = extractFloat128Exp( a );
5886 aSign = extractFloat128Sign( a );
5887 if ( aExp == 0x7FFF ) {
5888 if ( aSig0 | aSig1 ) {
5889 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5891 return packFloat32( aSign, 0xFF, 0 );
5893 aSig0 |= ( aSig1 != 0 );
5894 shift64RightJamming( aSig0, 18, &aSig0 );
5896 if ( aExp || zSig ) {
5900 return roundAndPackFloat32(aSign, aExp, zSig, status);
5904 /*----------------------------------------------------------------------------
5905 | Returns the result of converting the quadruple-precision floating-point
5906 | value `a' to the double-precision floating-point format. The conversion
5907 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5909 *----------------------------------------------------------------------------*/
5911 float64 float128_to_float64(float128 a, float_status *status)
5915 uint64_t aSig0, aSig1;
5917 aSig1 = extractFloat128Frac1( a );
5918 aSig0 = extractFloat128Frac0( a );
5919 aExp = extractFloat128Exp( a );
5920 aSign = extractFloat128Sign( a );
5921 if ( aExp == 0x7FFF ) {
5922 if ( aSig0 | aSig1 ) {
5923 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5925 return packFloat64( aSign, 0x7FF, 0 );
5927 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5928 aSig0 |= ( aSig1 != 0 );
5929 if ( aExp || aSig0 ) {
5930 aSig0 |= LIT64( 0x4000000000000000 );
5933 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5937 /*----------------------------------------------------------------------------
5938 | Returns the result of converting the quadruple-precision floating-point
5939 | value `a' to the extended double-precision floating-point format. The
5940 | conversion is performed according to the IEC/IEEE Standard for Binary
5941 | Floating-Point Arithmetic.
5942 *----------------------------------------------------------------------------*/
5944 floatx80 float128_to_floatx80(float128 a, float_status *status)
5948 uint64_t aSig0, aSig1;
5950 aSig1 = extractFloat128Frac1( a );
5951 aSig0 = extractFloat128Frac0( a );
5952 aExp = extractFloat128Exp( a );
5953 aSign = extractFloat128Sign( a );
5954 if ( aExp == 0x7FFF ) {
5955 if ( aSig0 | aSig1 ) {
5956 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5958 return packFloatx80(aSign, floatx80_infinity_high,
5959 floatx80_infinity_low);
5962 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5963 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5966 aSig0 |= LIT64( 0x0001000000000000 );
5968 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5969 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5973 /*----------------------------------------------------------------------------
5974 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5975 | returns the result as a quadruple-precision floating-point value. The
5976 | operation is performed according to the IEC/IEEE Standard for Binary
5977 | Floating-Point Arithmetic.
5978 *----------------------------------------------------------------------------*/
5980 float128 float128_round_to_int(float128 a, float_status *status)
5984 uint64_t lastBitMask, roundBitsMask;
5987 aExp = extractFloat128Exp( a );
5988 if ( 0x402F <= aExp ) {
5989 if ( 0x406F <= aExp ) {
5990 if ( ( aExp == 0x7FFF )
5991 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5993 return propagateFloat128NaN(a, a, status);
5998 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5999 roundBitsMask = lastBitMask - 1;
6001 switch (status->float_rounding_mode) {
6002 case float_round_nearest_even:
6003 if ( lastBitMask ) {
6004 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6005 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6008 if ( (int64_t) z.low < 0 ) {
6010 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6014 case float_round_ties_away:
6016 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6018 if ((int64_t) z.low < 0) {
6023 case float_round_to_zero:
6025 case float_round_up:
6026 if (!extractFloat128Sign(z)) {
6027 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6030 case float_round_down:
6031 if (extractFloat128Sign(z)) {
6032 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6038 z.low &= ~ roundBitsMask;
6041 if ( aExp < 0x3FFF ) {
6042 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6043 status->float_exception_flags |= float_flag_inexact;
6044 aSign = extractFloat128Sign( a );
6045 switch (status->float_rounding_mode) {
6046 case float_round_nearest_even:
6047 if ( ( aExp == 0x3FFE )
6048 && ( extractFloat128Frac0( a )
6049 | extractFloat128Frac1( a ) )
6051 return packFloat128( aSign, 0x3FFF, 0, 0 );
6054 case float_round_ties_away:
6055 if (aExp == 0x3FFE) {
6056 return packFloat128(aSign, 0x3FFF, 0, 0);
6059 case float_round_down:
6061 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6062 : packFloat128( 0, 0, 0, 0 );
6063 case float_round_up:
6065 aSign ? packFloat128( 1, 0, 0, 0 )
6066 : packFloat128( 0, 0x3FFF, 0, 0 );
6068 return packFloat128( aSign, 0, 0, 0 );
6071 lastBitMask <<= 0x402F - aExp;
6072 roundBitsMask = lastBitMask - 1;
6075 switch (status->float_rounding_mode) {
6076 case float_round_nearest_even:
6077 z.high += lastBitMask>>1;
6078 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6079 z.high &= ~ lastBitMask;
6082 case float_round_ties_away:
6083 z.high += lastBitMask>>1;
6085 case float_round_to_zero:
6087 case float_round_up:
6088 if (!extractFloat128Sign(z)) {
6089 z.high |= ( a.low != 0 );
6090 z.high += roundBitsMask;
6093 case float_round_down:
6094 if (extractFloat128Sign(z)) {
6095 z.high |= (a.low != 0);
6096 z.high += roundBitsMask;
6102 z.high &= ~ roundBitsMask;
6104 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6105 status->float_exception_flags |= float_flag_inexact;
6111 /*----------------------------------------------------------------------------
6112 | Returns the result of adding the absolute values of the quadruple-precision
6113 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6114 | before being returned. `zSign' is ignored if the result is a NaN.
6115 | The addition is performed according to the IEC/IEEE Standard for Binary
6116 | Floating-Point Arithmetic.
6117 *----------------------------------------------------------------------------*/
6119 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6120 float_status *status)
6122 int32_t aExp, bExp, zExp;
6123 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6126 aSig1 = extractFloat128Frac1( a );
6127 aSig0 = extractFloat128Frac0( a );
6128 aExp = extractFloat128Exp( a );
6129 bSig1 = extractFloat128Frac1( b );
6130 bSig0 = extractFloat128Frac0( b );
6131 bExp = extractFloat128Exp( b );
6132 expDiff = aExp - bExp;
6133 if ( 0 < expDiff ) {
6134 if ( aExp == 0x7FFF ) {
6135 if (aSig0 | aSig1) {
6136 return propagateFloat128NaN(a, b, status);
6144 bSig0 |= LIT64( 0x0001000000000000 );
6146 shift128ExtraRightJamming(
6147 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6150 else if ( expDiff < 0 ) {
6151 if ( bExp == 0x7FFF ) {
6152 if (bSig0 | bSig1) {
6153 return propagateFloat128NaN(a, b, status);
6155 return packFloat128( zSign, 0x7FFF, 0, 0 );
6161 aSig0 |= LIT64( 0x0001000000000000 );
6163 shift128ExtraRightJamming(
6164 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6168 if ( aExp == 0x7FFF ) {
6169 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6170 return propagateFloat128NaN(a, b, status);
6174 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6176 if (status->flush_to_zero) {
6177 if (zSig0 | zSig1) {
6178 float_raise(float_flag_output_denormal, status);
6180 return packFloat128(zSign, 0, 0, 0);
6182 return packFloat128( zSign, 0, zSig0, zSig1 );
6185 zSig0 |= LIT64( 0x0002000000000000 );
6189 aSig0 |= LIT64( 0x0001000000000000 );
6190 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6192 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6195 shift128ExtraRightJamming(
6196 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6198 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6202 /*----------------------------------------------------------------------------
6203 | Returns the result of subtracting the absolute values of the quadruple-
6204 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6205 | difference is negated before being returned. `zSign' is ignored if the
6206 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6207 | Standard for Binary Floating-Point Arithmetic.
6208 *----------------------------------------------------------------------------*/
6210 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6211 float_status *status)
6213 int32_t aExp, bExp, zExp;
6214 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6217 aSig1 = extractFloat128Frac1( a );
6218 aSig0 = extractFloat128Frac0( a );
6219 aExp = extractFloat128Exp( a );
6220 bSig1 = extractFloat128Frac1( b );
6221 bSig0 = extractFloat128Frac0( b );
6222 bExp = extractFloat128Exp( b );
6223 expDiff = aExp - bExp;
6224 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6225 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6226 if ( 0 < expDiff ) goto aExpBigger;
6227 if ( expDiff < 0 ) goto bExpBigger;
6228 if ( aExp == 0x7FFF ) {
6229 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6230 return propagateFloat128NaN(a, b, status);
6232 float_raise(float_flag_invalid, status);
6233 return float128_default_nan(status);
6239 if ( bSig0 < aSig0 ) goto aBigger;
6240 if ( aSig0 < bSig0 ) goto bBigger;
6241 if ( bSig1 < aSig1 ) goto aBigger;
6242 if ( aSig1 < bSig1 ) goto bBigger;
6243 return packFloat128(status->float_rounding_mode == float_round_down,
6246 if ( bExp == 0x7FFF ) {
6247 if (bSig0 | bSig1) {
6248 return propagateFloat128NaN(a, b, status);
6250 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6256 aSig0 |= LIT64( 0x4000000000000000 );
6258 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6259 bSig0 |= LIT64( 0x4000000000000000 );
6261 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6264 goto normalizeRoundAndPack;
6266 if ( aExp == 0x7FFF ) {
6267 if (aSig0 | aSig1) {
6268 return propagateFloat128NaN(a, b, status);
6276 bSig0 |= LIT64( 0x4000000000000000 );
6278 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6279 aSig0 |= LIT64( 0x4000000000000000 );
6281 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6283 normalizeRoundAndPack:
6285 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6290 /*----------------------------------------------------------------------------
6291 | Returns the result of adding the quadruple-precision floating-point values
6292 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6293 | for Binary Floating-Point Arithmetic.
6294 *----------------------------------------------------------------------------*/
6296 float128 float128_add(float128 a, float128 b, float_status *status)
6300 aSign = extractFloat128Sign( a );
6301 bSign = extractFloat128Sign( b );
6302 if ( aSign == bSign ) {
6303 return addFloat128Sigs(a, b, aSign, status);
6306 return subFloat128Sigs(a, b, aSign, status);
6311 /*----------------------------------------------------------------------------
6312 | Returns the result of subtracting the quadruple-precision floating-point
6313 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6314 | Standard for Binary Floating-Point Arithmetic.
6315 *----------------------------------------------------------------------------*/
6317 float128 float128_sub(float128 a, float128 b, float_status *status)
6321 aSign = extractFloat128Sign( a );
6322 bSign = extractFloat128Sign( b );
6323 if ( aSign == bSign ) {
6324 return subFloat128Sigs(a, b, aSign, status);
6327 return addFloat128Sigs(a, b, aSign, status);
6332 /*----------------------------------------------------------------------------
6333 | Returns the result of multiplying the quadruple-precision floating-point
6334 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6335 | Standard for Binary Floating-Point Arithmetic.
6336 *----------------------------------------------------------------------------*/
6338 float128 float128_mul(float128 a, float128 b, float_status *status)
6340 flag aSign, bSign, zSign;
6341 int32_t aExp, bExp, zExp;
6342 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6344 aSig1 = extractFloat128Frac1( a );
6345 aSig0 = extractFloat128Frac0( a );
6346 aExp = extractFloat128Exp( a );
6347 aSign = extractFloat128Sign( a );
6348 bSig1 = extractFloat128Frac1( b );
6349 bSig0 = extractFloat128Frac0( b );
6350 bExp = extractFloat128Exp( b );
6351 bSign = extractFloat128Sign( b );
6352 zSign = aSign ^ bSign;
6353 if ( aExp == 0x7FFF ) {
6354 if ( ( aSig0 | aSig1 )
6355 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6356 return propagateFloat128NaN(a, b, status);
6358 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6359 return packFloat128( zSign, 0x7FFF, 0, 0 );
6361 if ( bExp == 0x7FFF ) {
6362 if (bSig0 | bSig1) {
6363 return propagateFloat128NaN(a, b, status);
6365 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6367 float_raise(float_flag_invalid, status);
6368 return float128_default_nan(status);
6370 return packFloat128( zSign, 0x7FFF, 0, 0 );
6373 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6374 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6377 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6378 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6380 zExp = aExp + bExp - 0x4000;
6381 aSig0 |= LIT64( 0x0001000000000000 );
6382 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6383 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6384 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6385 zSig2 |= ( zSig3 != 0 );
6386 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6387 shift128ExtraRightJamming(
6388 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6391 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6395 /*----------------------------------------------------------------------------
6396 | Returns the result of dividing the quadruple-precision floating-point value
6397 | `a' by the corresponding value `b'. The operation is performed according to
6398 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6399 *----------------------------------------------------------------------------*/
6401 float128 float128_div(float128 a, float128 b, float_status *status)
6403 flag aSign, bSign, zSign;
6404 int32_t aExp, bExp, zExp;
6405 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6406 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6408 aSig1 = extractFloat128Frac1( a );
6409 aSig0 = extractFloat128Frac0( a );
6410 aExp = extractFloat128Exp( a );
6411 aSign = extractFloat128Sign( a );
6412 bSig1 = extractFloat128Frac1( b );
6413 bSig0 = extractFloat128Frac0( b );
6414 bExp = extractFloat128Exp( b );
6415 bSign = extractFloat128Sign( b );
6416 zSign = aSign ^ bSign;
6417 if ( aExp == 0x7FFF ) {
6418 if (aSig0 | aSig1) {
6419 return propagateFloat128NaN(a, b, status);
6421 if ( bExp == 0x7FFF ) {
6422 if (bSig0 | bSig1) {
6423 return propagateFloat128NaN(a, b, status);
6427 return packFloat128( zSign, 0x7FFF, 0, 0 );
6429 if ( bExp == 0x7FFF ) {
6430 if (bSig0 | bSig1) {
6431 return propagateFloat128NaN(a, b, status);
6433 return packFloat128( zSign, 0, 0, 0 );
6436 if ( ( bSig0 | bSig1 ) == 0 ) {
6437 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6439 float_raise(float_flag_invalid, status);
6440 return float128_default_nan(status);
6442 float_raise(float_flag_divbyzero, status);
6443 return packFloat128( zSign, 0x7FFF, 0, 0 );
6445 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6448 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6449 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6451 zExp = aExp - bExp + 0x3FFD;
6453 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6455 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6456 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6457 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6460 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6461 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6462 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6463 while ( (int64_t) rem0 < 0 ) {
6465 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6467 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6468 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6469 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6470 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6471 while ( (int64_t) rem1 < 0 ) {
6473 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6475 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6477 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6478 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6482 /*----------------------------------------------------------------------------
6483 | Returns the remainder of the quadruple-precision floating-point value `a'
6484 | with respect to the corresponding value `b'. The operation is performed
6485 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6486 *----------------------------------------------------------------------------*/
6488 float128 float128_rem(float128 a, float128 b, float_status *status)
6491 int32_t aExp, bExp, expDiff;
6492 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6493 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6496 aSig1 = extractFloat128Frac1( a );
6497 aSig0 = extractFloat128Frac0( a );
6498 aExp = extractFloat128Exp( a );
6499 aSign = extractFloat128Sign( a );
6500 bSig1 = extractFloat128Frac1( b );
6501 bSig0 = extractFloat128Frac0( b );
6502 bExp = extractFloat128Exp( b );
6503 if ( aExp == 0x7FFF ) {
6504 if ( ( aSig0 | aSig1 )
6505 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6506 return propagateFloat128NaN(a, b, status);
6510 if ( bExp == 0x7FFF ) {
6511 if (bSig0 | bSig1) {
6512 return propagateFloat128NaN(a, b, status);
6517 if ( ( bSig0 | bSig1 ) == 0 ) {
6519 float_raise(float_flag_invalid, status);
6520 return float128_default_nan(status);
6522 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6525 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6526 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6528 expDiff = aExp - bExp;
6529 if ( expDiff < -1 ) return a;
6531 aSig0 | LIT64( 0x0001000000000000 ),
6533 15 - ( expDiff < 0 ),
6538 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6539 q = le128( bSig0, bSig1, aSig0, aSig1 );
6540 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6542 while ( 0 < expDiff ) {
6543 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6544 q = ( 4 < q ) ? q - 4 : 0;
6545 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6546 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6547 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6548 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6551 if ( -64 < expDiff ) {
6552 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6553 q = ( 4 < q ) ? q - 4 : 0;
6555 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6557 if ( expDiff < 0 ) {
6558 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6561 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6563 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6564 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6567 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6568 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6571 alternateASig0 = aSig0;
6572 alternateASig1 = aSig1;
6574 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6575 } while ( 0 <= (int64_t) aSig0 );
6577 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6578 if ( ( sigMean0 < 0 )
6579 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6580 aSig0 = alternateASig0;
6581 aSig1 = alternateASig1;
6583 zSign = ( (int64_t) aSig0 < 0 );
6584 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6585 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6589 /*----------------------------------------------------------------------------
6590 | Returns the square root of the quadruple-precision floating-point value `a'.
6591 | The operation is performed according to the IEC/IEEE Standard for Binary
6592 | Floating-Point Arithmetic.
6593 *----------------------------------------------------------------------------*/
6595 float128 float128_sqrt(float128 a, float_status *status)
6599 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6600 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6602 aSig1 = extractFloat128Frac1( a );
6603 aSig0 = extractFloat128Frac0( a );
6604 aExp = extractFloat128Exp( a );
6605 aSign = extractFloat128Sign( a );
6606 if ( aExp == 0x7FFF ) {
6607 if (aSig0 | aSig1) {
6608 return propagateFloat128NaN(a, a, status);
6610 if ( ! aSign ) return a;
6614 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6616 float_raise(float_flag_invalid, status);
6617 return float128_default_nan(status);
6620 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6621 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6623 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6624 aSig0 |= LIT64( 0x0001000000000000 );
6625 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6626 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6627 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6628 doubleZSig0 = zSig0<<1;
6629 mul64To128( zSig0, zSig0, &term0, &term1 );
6630 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6631 while ( (int64_t) rem0 < 0 ) {
6634 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6636 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6637 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6638 if ( zSig1 == 0 ) zSig1 = 1;
6639 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6640 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6641 mul64To128( zSig1, zSig1, &term2, &term3 );
6642 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6643 while ( (int64_t) rem1 < 0 ) {
6645 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6647 term2 |= doubleZSig0;
6648 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6650 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6652 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6653 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6657 /*----------------------------------------------------------------------------
6658 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6659 | the corresponding value `b', and 0 otherwise. The invalid exception is
6660 | raised if either operand is a NaN. Otherwise, the comparison is performed
6661 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6662 *----------------------------------------------------------------------------*/
6664 int float128_eq(float128 a, float128 b, float_status *status)
6667 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6668 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6669 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6670 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6672 float_raise(float_flag_invalid, status);
6677 && ( ( a.high == b.high )
6679 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6684 /*----------------------------------------------------------------------------
6685 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6686 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6687 | exception is raised if either operand is a NaN. The comparison is performed
6688 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6689 *----------------------------------------------------------------------------*/
6691 int float128_le(float128 a, float128 b, float_status *status)
6695 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6696 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6697 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6698 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6700 float_raise(float_flag_invalid, status);
6703 aSign = extractFloat128Sign( a );
6704 bSign = extractFloat128Sign( b );
6705 if ( aSign != bSign ) {
6708 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6712 aSign ? le128( b.high, b.low, a.high, a.low )
6713 : le128( a.high, a.low, b.high, b.low );
6717 /*----------------------------------------------------------------------------
6718 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6719 | the corresponding value `b', and 0 otherwise. The invalid exception is
6720 | raised if either operand is a NaN. The comparison is performed according
6721 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6722 *----------------------------------------------------------------------------*/
6724 int float128_lt(float128 a, float128 b, float_status *status)
6728 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6729 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6730 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6731 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6733 float_raise(float_flag_invalid, status);
6736 aSign = extractFloat128Sign( a );
6737 bSign = extractFloat128Sign( b );
6738 if ( aSign != bSign ) {
6741 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6745 aSign ? lt128( b.high, b.low, a.high, a.low )
6746 : lt128( a.high, a.low, b.high, b.low );
6750 /*----------------------------------------------------------------------------
6751 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6752 | be compared, and 0 otherwise. The invalid exception is raised if either
6753 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6754 | Standard for Binary Floating-Point Arithmetic.
6755 *----------------------------------------------------------------------------*/
6757 int float128_unordered(float128 a, float128 b, float_status *status)
6759 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6760 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6761 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6762 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6764 float_raise(float_flag_invalid, status);
6770 /*----------------------------------------------------------------------------
6771 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6772 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6773 | exception. The comparison is performed according to the IEC/IEEE Standard
6774 | for Binary Floating-Point Arithmetic.
6775 *----------------------------------------------------------------------------*/
6777 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6780 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6781 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6782 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6783 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6785 if (float128_is_signaling_nan(a, status)
6786 || float128_is_signaling_nan(b, status)) {
6787 float_raise(float_flag_invalid, status);
6793 && ( ( a.high == b.high )
6795 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6800 /*----------------------------------------------------------------------------
6801 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6802 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6803 | cause an exception. Otherwise, the comparison is performed according to the
6804 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6805 *----------------------------------------------------------------------------*/
6807 int float128_le_quiet(float128 a, float128 b, float_status *status)
6811 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6812 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6813 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6814 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6816 if (float128_is_signaling_nan(a, status)
6817 || float128_is_signaling_nan(b, status)) {
6818 float_raise(float_flag_invalid, status);
6822 aSign = extractFloat128Sign( a );
6823 bSign = extractFloat128Sign( b );
6824 if ( aSign != bSign ) {
6827 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6831 aSign ? le128( b.high, b.low, a.high, a.low )
6832 : le128( a.high, a.low, b.high, b.low );
6836 /*----------------------------------------------------------------------------
6837 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6838 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6839 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6840 | Standard for Binary Floating-Point Arithmetic.
6841 *----------------------------------------------------------------------------*/
6843 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6847 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6848 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6849 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6850 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6852 if (float128_is_signaling_nan(a, status)
6853 || float128_is_signaling_nan(b, status)) {
6854 float_raise(float_flag_invalid, status);
6858 aSign = extractFloat128Sign( a );
6859 bSign = extractFloat128Sign( b );
6860 if ( aSign != bSign ) {
6863 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6867 aSign ? lt128( b.high, b.low, a.high, a.low )
6868 : lt128( a.high, a.low, b.high, b.low );
6872 /*----------------------------------------------------------------------------
6873 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6874 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6875 | comparison is performed according to the IEC/IEEE Standard for Binary
6876 | Floating-Point Arithmetic.
6877 *----------------------------------------------------------------------------*/
6879 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6881 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6882 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6883 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6884 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6886 if (float128_is_signaling_nan(a, status)
6887 || float128_is_signaling_nan(b, status)) {
6888 float_raise(float_flag_invalid, status);
6895 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6896 int is_quiet, float_status *status)
6900 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6901 float_raise(float_flag_invalid, status);
6902 return float_relation_unordered;
6904 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6905 ( extractFloatx80Frac( a )<<1 ) ) ||
6906 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6907 ( extractFloatx80Frac( b )<<1 ) )) {
6909 floatx80_is_signaling_nan(a, status) ||
6910 floatx80_is_signaling_nan(b, status)) {
6911 float_raise(float_flag_invalid, status);
6913 return float_relation_unordered;
6915 aSign = extractFloatx80Sign( a );
6916 bSign = extractFloatx80Sign( b );
6917 if ( aSign != bSign ) {
6919 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6920 ( ( a.low | b.low ) == 0 ) ) {
6922 return float_relation_equal;
6924 return 1 - (2 * aSign);
6927 if (a.low == b.low && a.high == b.high) {
6928 return float_relation_equal;
6930 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6935 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6937 return floatx80_compare_internal(a, b, 0, status);
6940 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6942 return floatx80_compare_internal(a, b, 1, status);
6945 static inline int float128_compare_internal(float128 a, float128 b,
6946 int is_quiet, float_status *status)
6950 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6951 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6952 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6953 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6955 float128_is_signaling_nan(a, status) ||
6956 float128_is_signaling_nan(b, status)) {
6957 float_raise(float_flag_invalid, status);
6959 return float_relation_unordered;
6961 aSign = extractFloat128Sign( a );
6962 bSign = extractFloat128Sign( b );
6963 if ( aSign != bSign ) {
6964 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6966 return float_relation_equal;
6968 return 1 - (2 * aSign);
6971 if (a.low == b.low && a.high == b.high) {
6972 return float_relation_equal;
6974 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6979 int float128_compare(float128 a, float128 b, float_status *status)
6981 return float128_compare_internal(a, b, 0, status);
6984 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6986 return float128_compare_internal(a, b, 1, status);
6989 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6995 if (floatx80_invalid_encoding(a)) {
6996 float_raise(float_flag_invalid, status);
6997 return floatx80_default_nan(status);
6999 aSig = extractFloatx80Frac( a );
7000 aExp = extractFloatx80Exp( a );
7001 aSign = extractFloatx80Sign( a );
7003 if ( aExp == 0x7FFF ) {
7005 return propagateFloatx80NaN(a, a, status);
7019 } else if (n < -0x10000) {
7024 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7025 aSign, aExp, aSig, 0, status);
7028 float128 float128_scalbn(float128 a, int n, float_status *status)
7032 uint64_t aSig0, aSig1;
7034 aSig1 = extractFloat128Frac1( a );
7035 aSig0 = extractFloat128Frac0( a );
7036 aExp = extractFloat128Exp( a );
7037 aSign = extractFloat128Sign( a );
7038 if ( aExp == 0x7FFF ) {
7039 if ( aSig0 | aSig1 ) {
7040 return propagateFloat128NaN(a, a, status);
7045 aSig0 |= LIT64( 0x0001000000000000 );
7046 } else if (aSig0 == 0 && aSig1 == 0) {
7054 } else if (n < -0x10000) {
7059 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1