]> Git Repo - qemu.git/blame - fpu/softfloat.c
softfloat: Handle default NaN mode after pickNaNMulAdd, not before
[qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
d38ea87a 85#include "qemu/osdep.h"
6fff2167 86#include "qemu/bitops.h"
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76 89/* We only need stdlib for abort() */
dc355b76 90
158142c2
FB
91/*----------------------------------------------------------------------------
92| Primitive arithmetic functions, including multi-word arithmetic, and
93| division and square root approximations. (Can be specialized to target if
94| desired.)
95*----------------------------------------------------------------------------*/
88857aca 96#include "fpu/softfloat-macros.h"
158142c2
FB
97
98/*----------------------------------------------------------------------------
99| Functions and definitions to determine: (1) whether tininess for underflow
100| is detected before or after rounding by default, (2) what (if anything)
101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103| are propagated from function inputs to output. These details are target-
104| specific.
105*----------------------------------------------------------------------------*/
106#include "softfloat-specialize.h"
107
bb4d4bb3
PM
108/*----------------------------------------------------------------------------
109| Returns the fraction bits of the half-precision floating-point value `a'.
110*----------------------------------------------------------------------------*/
111
a49db98d 112static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
113{
114 return float16_val(a) & 0x3ff;
115}
116
117/*----------------------------------------------------------------------------
118| Returns the exponent bits of the half-precision floating-point value `a'.
119*----------------------------------------------------------------------------*/
120
0c48262d 121static inline int extractFloat16Exp(float16 a)
bb4d4bb3
PM
122{
123 return (float16_val(a) >> 10) & 0x1f;
124}
125
126/*----------------------------------------------------------------------------
127| Returns the sign bit of the half-precision floating-point value `a'.
128*----------------------------------------------------------------------------*/
129
a49db98d 130static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
131{
132 return float16_val(a)>>15;
133}
134
d97544c9
AB
135/*----------------------------------------------------------------------------
136| Returns the fraction bits of the single-precision floating-point value `a'.
137*----------------------------------------------------------------------------*/
138
139static inline uint32_t extractFloat32Frac(float32 a)
140{
141 return float32_val(a) & 0x007FFFFF;
142}
143
144/*----------------------------------------------------------------------------
145| Returns the exponent bits of the single-precision floating-point value `a'.
146*----------------------------------------------------------------------------*/
147
148static inline int extractFloat32Exp(float32 a)
149{
150 return (float32_val(a) >> 23) & 0xFF;
151}
152
153/*----------------------------------------------------------------------------
154| Returns the sign bit of the single-precision floating-point value `a'.
155*----------------------------------------------------------------------------*/
156
157static inline flag extractFloat32Sign(float32 a)
158{
159 return float32_val(a) >> 31;
160}
161
162/*----------------------------------------------------------------------------
163| Returns the fraction bits of the double-precision floating-point value `a'.
164*----------------------------------------------------------------------------*/
165
166static inline uint64_t extractFloat64Frac(float64 a)
167{
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169}
170
171/*----------------------------------------------------------------------------
172| Returns the exponent bits of the double-precision floating-point value `a'.
173*----------------------------------------------------------------------------*/
174
175static inline int extractFloat64Exp(float64 a)
176{
177 return (float64_val(a) >> 52) & 0x7FF;
178}
179
180/*----------------------------------------------------------------------------
181| Returns the sign bit of the double-precision floating-point value `a'.
182*----------------------------------------------------------------------------*/
183
184static inline flag extractFloat64Sign(float64 a)
185{
186 return float64_val(a) >> 63;
187}
188
a90119b5
AB
189/*
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
192 */
193
/*
 * Classification of a decomposed value.  The enumerator order matters:
 * is_nan() relies on every NaN kind comparing >= float_class_qnan.
 * The type is packed so that it occupies as little space as possible
 * inside FloatParts.
 */
typedef enum __attribute__((__packed__)) {
    float_class_unclassified = 0,   /* raw, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    /* all NaNs from here */
    float_class_qnan,               /* quiet NaN */
    float_class_snan,               /* signaling NaN */
    float_class_dnan,               /* result is the default NaN */
    float_class_msnan,              /* NaN that may need silencing on pack */
} FloatClass;
204
205/*
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
210 *
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
214 */
215
216typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221} FloatParts;
222
/* Bit position of the binary point inside FloatParts.frac. */
#define DECOMPOSED_BINARY_POINT (64 - 2)
/* The integer ("implicit") bit of a normalized fraction. */
#define DECOMPOSED_IMPLICIT_BIT (1ULL << DECOMPOSED_BINARY_POINT)
/* The bit above the implicit bit: set when a fraction has carried out. */
#define DECOMPOSED_OVERFLOW_BIT (1ULL << (DECOMPOSED_BINARY_POINT + 1))
226
227/* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based on the size of the fraction
234 * frac_lsb: least significant bit of fraction
235 * frac_lsbm1: the bit below the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
237 */
typedef struct {
    /* Properties of the encoding itself. */
    int exp_size;             /* width of the exponent field, in bits */
    int exp_bias;             /* bias applied to the stored exponent */
    int exp_max;              /* all-ones exponent field value */
    int frac_size;            /* width of the fraction field, in bits */
    int frac_shift;           /* shift aligning the fraction field with
                                 DECOMPOSED_BINARY_POINT */
    /* Rounding helpers, expressed in the decomposed fraction's bit space. */
    uint64_t frac_lsb;        /* least significant retained fraction bit */
    uint64_t frac_lsbm1;      /* the bit just below frac_lsb */
    uint64_t round_mask;      /* all bits below frac_lsb */
    uint64_t roundeven_mask;  /* frac_lsb together with all bits below it */
} FloatFmt;
249
/*
 * Expand to the designated initializers of a FloatFmt for a format with
 * an E-bit exponent field and an F-bit fraction field.
 *
 * Every use of E and F is parenthesized so that expression arguments
 * (e.g. FLOAT_PARAMS(8, 20 + 3)) expand with the intended precedence.
 * DECOMPOSED_BINARY_POINT must be defined at the point of use.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
261
262static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
264};
265
266static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
268};
269
270static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
272};
273
6fff2167
AB
274/* Unpack a float to parts, but do not canonicalize. */
275static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276{
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
278
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
284 };
285}
286
287static inline FloatParts float16_unpack_raw(float16 f)
288{
289 return unpack_raw(float16_params, f);
290}
291
292static inline FloatParts float32_unpack_raw(float32 f)
293{
294 return unpack_raw(float32_params, f);
295}
296
297static inline FloatParts float64_unpack_raw(float64 f)
298{
299 return unpack_raw(float64_params, f);
300}
301
302/* Pack a float from parts, but do not canonicalize. */
303static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304{
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
308}
309
310static inline float16 float16_pack_raw(FloatParts p)
311{
312 return make_float16(pack_raw(float16_params, p));
313}
314
315static inline float32 float32_pack_raw(FloatParts p)
316{
317 return make_float32(pack_raw(float32_params, p));
318}
319
320static inline float64 float64_pack_raw(FloatParts p)
321{
322 return make_float64(pack_raw(float64_params, p));
323}
324
325/* Canonicalize EXP and FRAC, setting CLS. */
326static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
328{
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333#ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335#else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
341 }
342#endif
343 }
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
356 }
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361 }
362 return part;
363}
364
365/* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369 */
370
/*
 * Round the canonical value `p' to the precision described by `parm'
 * and convert EXP/FRAC back to raw field form (biased exponent,
 * fraction shifted down by frac_shift).
 *
 * p:    canonical parts; cls must be normal, zero, inf, qnan or snan --
 *       any other class hits g_assert_not_reached().
 * s:    supplies the rounding mode, flush_to_zero and tininess
 *       detection settings; the exception flags accumulated here are
 *       raised on it through float_raise() just before returning.
 * parm: format parameters (masks, bias, exp_max, frac_shift).
 *
 * Returns `p' with exp and frac replaced by their raw values; cls is
 * updated when rounding turns a normal into a zero or an infinity.
 */
371static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
373{
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
382
383 frac = p.frac;
384 exp = p.exp;
385
386 switch (p.cls) {
387 case float_class_normal:
    /*
     * Select the rounding increment `inc' and whether an overflowing
     * result saturates to the largest finite exponent (overflow_norm)
     * instead of becoming infinity.
     */
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
    /* Add half an lsb, except on an exact tie whose lsb is already 0. */
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
411 }
412
    /* Re-apply the bias; a biased exponent <= 0 takes the denormal paths. */
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
    /* Only touch frac (and flag inexact) when discarded bits are non-zero. */
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
    /* Rounding carried out of the implicit bit: renormalize. */
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
421 }
422 }
423 frac >>= frac_shift;
424
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
    /* Saturate: largest finite exponent with an all-ones fraction. */
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
433 }
434 }
435 } else if (s->flush_to_zero) {
    /* Denormal result with flush-to-zero enabled: flag it and return zero. */
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
    /*
     * Denormal result.  It is tiny when tininess is detected before
     * rounding, when the biased exponent is negative, or when adding
     * the increment does not carry into DECOMPOSED_OVERFLOW_BIT.
     */
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444
    /* Denormalize; shift64RightJamming is relied on to keep lost bits sticky. */
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
451 }
452 flags |= float_flag_inexact;
453 frac += inc;
454 }
455
    /* Raw exponent is 1 if rounding reached the implicit bit, else 0. */
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
458
    /* Underflow is only signalled when the tiny result is also inexact. */
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
461 }
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
464 }
465 }
466 break;
467
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
473
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
479
480 case float_class_qnan:
481 case float_class_snan:
    /* NaNs get the all-ones exponent; frac is passed through unchanged. */
482 exp = exp_max;
483 break;
484
485 default:
486 g_assert_not_reached();
487 }
488
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
493}
494
495static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496{
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
498}
499
500static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501{
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
510 }
511}
512
513static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514{
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
516}
517
518static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519{
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
528 }
529}
530
531static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532{
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
534}
535
536static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537{
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
546 }
547}
548
549/* Simple helpers for checking what kind of NaN (if any) we have */
550static bool is_nan(FloatClass c)
551{
552 return unlikely(c >= float_class_qnan);
553}
554static bool is_snan(FloatClass c)
555{
556 return c == float_class_snan;
557}
558static bool is_qnan(FloatClass c)
559{
560 return c == float_class_qnan;
561}
562
dbe4d53a
AB
563static FloatParts return_nan(FloatParts a, float_status *s)
564{
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
573 }
574 break;
575
576 default:
577 g_assert_not_reached();
578 }
579 return a;
580}
581
6fff2167
AB
582static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583{
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
586 }
587
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
596 }
597 a.cls = float_class_msnan;
598 }
599 return a;
600}
601
d446830a
AB
602static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
604{
1839189b
PM
605 int which;
606
d446830a
AB
607 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
608 s->float_exception_flags |= float_flag_invalid;
609 }
610
1839189b
PM
611 which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
612 is_qnan(b.cls), is_snan(b.cls),
613 is_qnan(c.cls), is_snan(c.cls),
614 inf_zero, s);
615
d446830a 616 if (s->default_nan_mode) {
1839189b
PM
617 /* Note that this check is after pickNaNMulAdd so that function
618 * has an opportunity to set the Invalid flag.
619 */
d446830a 620 a.cls = float_class_dnan;
1839189b
PM
621 return a;
622 }
d446830a 623
1839189b
PM
624 switch (which) {
625 case 0:
626 break;
627 case 1:
628 a = b;
629 break;
630 case 2:
631 a = c;
632 break;
633 case 3:
634 a.cls = float_class_dnan;
635 return a;
636 default:
637 g_assert_not_reached();
d446830a 638 }
1839189b
PM
639 a.cls = float_class_msnan;
640
d446830a
AB
641 return a;
642}
643
6fff2167
AB
644/*
645 * Returns the result of adding or subtracting the values of the
646 * floating-point values `a' and `b'. The operation is performed
647 * according to the IEC/IEEE Standard for Binary Floating-Point
648 * Arithmetic.
649 */
650
651static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
652 float_status *s)
653{
654 bool a_sign = a.sign;
655 bool b_sign = b.sign ^ subtract;
656
657 if (a_sign != b_sign) {
658 /* Subtraction */
659
660 if (a.cls == float_class_normal && b.cls == float_class_normal) {
661 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
662 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
663 a.frac = a.frac - b.frac;
664 } else {
665 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
666 a.frac = b.frac - a.frac;
667 a.exp = b.exp;
668 a_sign ^= 1;
669 }
670
671 if (a.frac == 0) {
672 a.cls = float_class_zero;
673 a.sign = s->float_rounding_mode == float_round_down;
674 } else {
675 int shift = clz64(a.frac) - 1;
676 a.frac = a.frac << shift;
677 a.exp = a.exp - shift;
678 a.sign = a_sign;
679 }
680 return a;
681 }
682 if (is_nan(a.cls) || is_nan(b.cls)) {
683 return pick_nan(a, b, s);
684 }
685 if (a.cls == float_class_inf) {
686 if (b.cls == float_class_inf) {
687 float_raise(float_flag_invalid, s);
688 a.cls = float_class_dnan;
689 }
690 return a;
691 }
692 if (a.cls == float_class_zero && b.cls == float_class_zero) {
693 a.sign = s->float_rounding_mode == float_round_down;
694 return a;
695 }
696 if (a.cls == float_class_zero || b.cls == float_class_inf) {
697 b.sign = a_sign ^ 1;
698 return b;
699 }
700 if (b.cls == float_class_zero) {
701 return a;
702 }
703 } else {
704 /* Addition */
705 if (a.cls == float_class_normal && b.cls == float_class_normal) {
706 if (a.exp > b.exp) {
707 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
708 } else if (a.exp < b.exp) {
709 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
710 a.exp = b.exp;
711 }
712 a.frac += b.frac;
713 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
714 a.frac >>= 1;
715 a.exp += 1;
716 }
717 return a;
718 }
719 if (is_nan(a.cls) || is_nan(b.cls)) {
720 return pick_nan(a, b, s);
721 }
722 if (a.cls == float_class_inf || b.cls == float_class_zero) {
723 return a;
724 }
725 if (b.cls == float_class_inf || a.cls == float_class_zero) {
726 b.sign = b_sign;
727 return b;
728 }
729 }
730 g_assert_not_reached();
731}
732
733/*
734 * Returns the result of adding or subtracting the floating-point
735 * values `a' and `b'. The operation is performed according to the
736 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
737 */
738
739float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
740 float_status *status)
741{
742 FloatParts pa = float16_unpack_canonical(a, status);
743 FloatParts pb = float16_unpack_canonical(b, status);
744 FloatParts pr = addsub_floats(pa, pb, false, status);
745
746 return float16_round_pack_canonical(pr, status);
747}
748
749float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
750 float_status *status)
751{
752 FloatParts pa = float32_unpack_canonical(a, status);
753 FloatParts pb = float32_unpack_canonical(b, status);
754 FloatParts pr = addsub_floats(pa, pb, false, status);
755
756 return float32_round_pack_canonical(pr, status);
757}
758
759float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
760 float_status *status)
761{
762 FloatParts pa = float64_unpack_canonical(a, status);
763 FloatParts pb = float64_unpack_canonical(b, status);
764 FloatParts pr = addsub_floats(pa, pb, false, status);
765
766 return float64_round_pack_canonical(pr, status);
767}
768
769float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
770 float_status *status)
771{
772 FloatParts pa = float16_unpack_canonical(a, status);
773 FloatParts pb = float16_unpack_canonical(b, status);
774 FloatParts pr = addsub_floats(pa, pb, true, status);
775
776 return float16_round_pack_canonical(pr, status);
777}
778
779float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
780 float_status *status)
781{
782 FloatParts pa = float32_unpack_canonical(a, status);
783 FloatParts pb = float32_unpack_canonical(b, status);
784 FloatParts pr = addsub_floats(pa, pb, true, status);
785
786 return float32_round_pack_canonical(pr, status);
787}
788
789float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
790 float_status *status)
791{
792 FloatParts pa = float64_unpack_canonical(a, status);
793 FloatParts pb = float64_unpack_canonical(b, status);
794 FloatParts pr = addsub_floats(pa, pb, true, status);
795
796 return float64_round_pack_canonical(pr, status);
797}
798
74d707e2
AB
799/*
800 * Returns the result of multiplying the floating-point values `a' and
801 * `b'. The operation is performed according to the IEC/IEEE Standard
802 * for Binary Floating-Point Arithmetic.
803 */
804
805static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
806{
807 bool sign = a.sign ^ b.sign;
808
809 if (a.cls == float_class_normal && b.cls == float_class_normal) {
810 uint64_t hi, lo;
811 int exp = a.exp + b.exp;
812
813 mul64To128(a.frac, b.frac, &hi, &lo);
814 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
815 if (lo & DECOMPOSED_OVERFLOW_BIT) {
816 shift64RightJamming(lo, 1, &lo);
817 exp += 1;
818 }
819
820 /* Re-use a */
821 a.exp = exp;
822 a.sign = sign;
823 a.frac = lo;
824 return a;
825 }
826 /* handle all the NaN cases */
827 if (is_nan(a.cls) || is_nan(b.cls)) {
828 return pick_nan(a, b, s);
829 }
830 /* Inf * Zero == NaN */
831 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
832 (a.cls == float_class_zero && b.cls == float_class_inf)) {
833 s->float_exception_flags |= float_flag_invalid;
834 a.cls = float_class_dnan;
835 a.sign = sign;
836 return a;
837 }
838 /* Multiply by 0 or Inf */
839 if (a.cls == float_class_inf || a.cls == float_class_zero) {
840 a.sign = sign;
841 return a;
842 }
843 if (b.cls == float_class_inf || b.cls == float_class_zero) {
844 b.sign = sign;
845 return b;
846 }
847 g_assert_not_reached();
848}
849
850float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
851 float_status *status)
852{
853 FloatParts pa = float16_unpack_canonical(a, status);
854 FloatParts pb = float16_unpack_canonical(b, status);
855 FloatParts pr = mul_floats(pa, pb, status);
856
857 return float16_round_pack_canonical(pr, status);
858}
859
860float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
861 float_status *status)
862{
863 FloatParts pa = float32_unpack_canonical(a, status);
864 FloatParts pb = float32_unpack_canonical(b, status);
865 FloatParts pr = mul_floats(pa, pb, status);
866
867 return float32_round_pack_canonical(pr, status);
868}
869
870float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
871 float_status *status)
872{
873 FloatParts pa = float64_unpack_canonical(a, status);
874 FloatParts pb = float64_unpack_canonical(b, status);
875 FloatParts pr = mul_floats(pa, pb, status);
876
877 return float64_round_pack_canonical(pr, status);
878}
879
d446830a
AB
880/*
881 * Returns the result of multiplying the floating-point values `a' and
882 * `b' then adding 'c', with no intermediate rounding step after the
883 * multiplication. The operation is performed according to the
884 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
885 * The flags argument allows the caller to select negation of the
886 * addend, the intermediate product, or the final result. (The
887 * difference between this and having the caller do a separate
888 * negation is that negating externally will flip the sign bit on
889 * NaNs.)
890 */
891
892static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
893 int flags, float_status *s)
894{
895 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
896 ((1 << float_class_inf) | (1 << float_class_zero));
897 bool p_sign;
898 bool sign_flip = flags & float_muladd_negate_result;
899 FloatClass p_class;
900 uint64_t hi, lo;
901 int p_exp;
902
903 /* It is implementation-defined whether the cases of (0,inf,qnan)
904 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
905 * they return if they do), so we have to hand this information
906 * off to the target-specific pick-a-NaN routine.
907 */
908 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
909 return pick_nan_muladd(a, b, c, inf_zero, s);
910 }
911
912 if (inf_zero) {
913 s->float_exception_flags |= float_flag_invalid;
914 a.cls = float_class_dnan;
915 return a;
916 }
917
918 if (flags & float_muladd_negate_c) {
919 c.sign ^= 1;
920 }
921
922 p_sign = a.sign ^ b.sign;
923
924 if (flags & float_muladd_negate_product) {
925 p_sign ^= 1;
926 }
927
928 if (a.cls == float_class_inf || b.cls == float_class_inf) {
929 p_class = float_class_inf;
930 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
931 p_class = float_class_zero;
932 } else {
933 p_class = float_class_normal;
934 }
935
936 if (c.cls == float_class_inf) {
937 if (p_class == float_class_inf && p_sign != c.sign) {
938 s->float_exception_flags |= float_flag_invalid;
939 a.cls = float_class_dnan;
940 } else {
941 a.cls = float_class_inf;
942 a.sign = c.sign ^ sign_flip;
943 }
944 return a;
945 }
946
947 if (p_class == float_class_inf) {
948 a.cls = float_class_inf;
949 a.sign = p_sign ^ sign_flip;
950 return a;
951 }
952
953 if (p_class == float_class_zero) {
954 if (c.cls == float_class_zero) {
955 if (p_sign != c.sign) {
956 p_sign = s->float_rounding_mode == float_round_down;
957 }
958 c.sign = p_sign;
959 } else if (flags & float_muladd_halve_result) {
960 c.exp -= 1;
961 }
962 c.sign ^= sign_flip;
963 return c;
964 }
965
966 /* a & b should be normals now... */
967 assert(a.cls == float_class_normal &&
968 b.cls == float_class_normal);
969
970 p_exp = a.exp + b.exp;
971
972 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
973 * result.
974 */
975 mul64To128(a.frac, b.frac, &hi, &lo);
976 /* binary point now at bit 124 */
977
978 /* check for overflow */
979 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
980 shift128RightJamming(hi, lo, 1, &hi, &lo);
981 p_exp += 1;
982 }
983
984 /* + add/sub */
985 if (c.cls == float_class_zero) {
986 /* move binary point back to 62 */
987 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
988 } else {
989 int exp_diff = p_exp - c.exp;
990 if (p_sign == c.sign) {
991 /* Addition */
992 if (exp_diff <= 0) {
993 shift128RightJamming(hi, lo,
994 DECOMPOSED_BINARY_POINT - exp_diff,
995 &hi, &lo);
996 lo += c.frac;
997 p_exp = c.exp;
998 } else {
999 uint64_t c_hi, c_lo;
1000 /* shift c to the same binary point as the product (124) */
1001 c_hi = c.frac >> 2;
1002 c_lo = 0;
1003 shift128RightJamming(c_hi, c_lo,
1004 exp_diff,
1005 &c_hi, &c_lo);
1006 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1007 /* move binary point back to 62 */
1008 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1009 }
1010
1011 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1012 shift64RightJamming(lo, 1, &lo);
1013 p_exp += 1;
1014 }
1015
1016 } else {
1017 /* Subtraction */
1018 uint64_t c_hi, c_lo;
1019 /* make C binary point match product at bit 124 */
1020 c_hi = c.frac >> 2;
1021 c_lo = 0;
1022
1023 if (exp_diff <= 0) {
1024 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1025 if (exp_diff == 0
1026 &&
1027 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1028 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1029 } else {
1030 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1031 p_sign ^= 1;
1032 p_exp = c.exp;
1033 }
1034 } else {
1035 shift128RightJamming(c_hi, c_lo,
1036 exp_diff,
1037 &c_hi, &c_lo);
1038 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1039 }
1040
1041 if (hi == 0 && lo == 0) {
1042 a.cls = float_class_zero;
1043 a.sign = s->float_rounding_mode == float_round_down;
1044 a.sign ^= sign_flip;
1045 return a;
1046 } else {
1047 int shift;
1048 if (hi != 0) {
1049 shift = clz64(hi);
1050 } else {
1051 shift = clz64(lo) + 64;
1052 }
1053 /* Normalizing to a binary point of 124 is the
1054 correct adjust for the exponent. However since we're
1055 shifting, we might as well put the binary point back
1056 at 62 where we really want it. Therefore shift as
1057 if we're leaving 1 bit at the top of the word, but
1058 adjust the exponent as if we're leaving 3 bits. */
1059 shift -= 1;
1060 if (shift >= 64) {
1061 lo = lo << (shift - 64);
1062 } else {
1063 hi = (hi << shift) | (lo >> (64 - shift));
1064 lo = hi | ((lo << shift) != 0);
1065 }
1066 p_exp -= shift - 2;
1067 }
1068 }
1069 }
1070
1071 if (flags & float_muladd_halve_result) {
1072 p_exp -= 1;
1073 }
1074
1075 /* finally prepare our result */
1076 a.cls = float_class_normal;
1077 a.sign = p_sign ^ sign_flip;
1078 a.exp = p_exp;
1079 a.frac = lo;
1080
1081 return a;
1082}
1083
1084float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1085 int flags, float_status *status)
1086{
1087 FloatParts pa = float16_unpack_canonical(a, status);
1088 FloatParts pb = float16_unpack_canonical(b, status);
1089 FloatParts pc = float16_unpack_canonical(c, status);
1090 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1091
1092 return float16_round_pack_canonical(pr, status);
1093}
1094
1095float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1096 int flags, float_status *status)
1097{
1098 FloatParts pa = float32_unpack_canonical(a, status);
1099 FloatParts pb = float32_unpack_canonical(b, status);
1100 FloatParts pc = float32_unpack_canonical(c, status);
1101 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1102
1103 return float32_round_pack_canonical(pr, status);
1104}
1105
1106float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1107 int flags, float_status *status)
1108{
1109 FloatParts pa = float64_unpack_canonical(a, status);
1110 FloatParts pb = float64_unpack_canonical(b, status);
1111 FloatParts pc = float64_unpack_canonical(c, status);
1112 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1113
1114 return float64_round_pack_canonical(pr, status);
1115}
1116
cf07323d
AB
1117/*
1118 * Returns the result of dividing the floating-point value `a' by the
1119 * corresponding value `b'. The operation is performed according to
1120 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1121 */
1122
1123static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1124{
1125 bool sign = a.sign ^ b.sign;
1126
1127 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1128 uint64_t temp_lo, temp_hi;
1129 int exp = a.exp - b.exp;
1130 if (a.frac < b.frac) {
1131 exp -= 1;
1132 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1133 &temp_hi, &temp_lo);
1134 } else {
1135 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1136 &temp_hi, &temp_lo);
1137 }
1138 /* LSB of quot is set if inexact which roundandpack will use
1139 * to set flags. Yet again we re-use a for the result */
1140 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1141 a.sign = sign;
1142 a.exp = exp;
1143 return a;
1144 }
1145 /* handle all the NaN cases */
1146 if (is_nan(a.cls) || is_nan(b.cls)) {
1147 return pick_nan(a, b, s);
1148 }
1149 /* 0/0 or Inf/Inf */
1150 if (a.cls == b.cls
1151 &&
1152 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1153 s->float_exception_flags |= float_flag_invalid;
1154 a.cls = float_class_dnan;
1155 return a;
1156 }
9cb4e398
AB
1157 /* Inf / x or 0 / x */
1158 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1159 a.sign = sign;
1160 return a;
1161 }
cf07323d
AB
1162 /* Div 0 => Inf */
1163 if (b.cls == float_class_zero) {
1164 s->float_exception_flags |= float_flag_divbyzero;
1165 a.cls = float_class_inf;
1166 a.sign = sign;
1167 return a;
1168 }
cf07323d
AB
1169 /* Div by Inf */
1170 if (b.cls == float_class_inf) {
1171 a.cls = float_class_zero;
1172 a.sign = sign;
1173 return a;
1174 }
1175 g_assert_not_reached();
1176}
1177
1178float16 float16_div(float16 a, float16 b, float_status *status)
1179{
1180 FloatParts pa = float16_unpack_canonical(a, status);
1181 FloatParts pb = float16_unpack_canonical(b, status);
1182 FloatParts pr = div_floats(pa, pb, status);
1183
1184 return float16_round_pack_canonical(pr, status);
1185}
1186
1187float32 float32_div(float32 a, float32 b, float_status *status)
1188{
1189 FloatParts pa = float32_unpack_canonical(a, status);
1190 FloatParts pb = float32_unpack_canonical(b, status);
1191 FloatParts pr = div_floats(pa, pb, status);
1192
1193 return float32_round_pack_canonical(pr, status);
1194}
1195
1196float64 float64_div(float64 a, float64 b, float_status *status)
1197{
1198 FloatParts pa = float64_unpack_canonical(a, status);
1199 FloatParts pb = float64_unpack_canonical(b, status);
1200 FloatParts pr = div_floats(pa, pb, status);
1201
1202 return float64_round_pack_canonical(pr, status);
1203}
1204
dbe4d53a
AB
1205/*
1206 * Rounds the floating-point value `a' to an integer, and returns the
1207 * result as a floating-point value. The operation is performed
1208 * according to the IEC/IEEE Standard for Binary Floating-Point
1209 * Arithmetic.
1210 */
1211
1212static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1213{
1214 if (is_nan(a.cls)) {
1215 return return_nan(a, s);
1216 }
1217
1218 switch (a.cls) {
1219 case float_class_zero:
1220 case float_class_inf:
1221 case float_class_qnan:
1222 /* already "integral" */
1223 break;
1224 case float_class_normal:
1225 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1226 /* already integral */
1227 break;
1228 }
1229 if (a.exp < 0) {
1230 bool one;
1231 /* all fractional */
1232 s->float_exception_flags |= float_flag_inexact;
1233 switch (rounding_mode) {
1234 case float_round_nearest_even:
1235 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1236 break;
1237 case float_round_ties_away:
1238 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1239 break;
1240 case float_round_to_zero:
1241 one = false;
1242 break;
1243 case float_round_up:
1244 one = !a.sign;
1245 break;
1246 case float_round_down:
1247 one = a.sign;
1248 break;
1249 default:
1250 g_assert_not_reached();
1251 }
1252
1253 if (one) {
1254 a.frac = DECOMPOSED_IMPLICIT_BIT;
1255 a.exp = 0;
1256 } else {
1257 a.cls = float_class_zero;
1258 }
1259 } else {
1260 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1261 uint64_t frac_lsbm1 = frac_lsb >> 1;
1262 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1263 uint64_t rnd_mask = rnd_even_mask >> 1;
1264 uint64_t inc;
1265
1266 switch (rounding_mode) {
1267 case float_round_nearest_even:
1268 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1269 break;
1270 case float_round_ties_away:
1271 inc = frac_lsbm1;
1272 break;
1273 case float_round_to_zero:
1274 inc = 0;
1275 break;
1276 case float_round_up:
1277 inc = a.sign ? 0 : rnd_mask;
1278 break;
1279 case float_round_down:
1280 inc = a.sign ? rnd_mask : 0;
1281 break;
1282 default:
1283 g_assert_not_reached();
1284 }
1285
1286 if (a.frac & rnd_mask) {
1287 s->float_exception_flags |= float_flag_inexact;
1288 a.frac += inc;
1289 a.frac &= ~rnd_mask;
1290 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1291 a.frac >>= 1;
1292 a.exp++;
1293 }
1294 }
1295 }
1296 break;
1297 default:
1298 g_assert_not_reached();
1299 }
1300 return a;
1301}
1302
1303float16 float16_round_to_int(float16 a, float_status *s)
1304{
1305 FloatParts pa = float16_unpack_canonical(a, s);
1306 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1307 return float16_round_pack_canonical(pr, s);
1308}
1309
1310float32 float32_round_to_int(float32 a, float_status *s)
1311{
1312 FloatParts pa = float32_unpack_canonical(a, s);
1313 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1314 return float32_round_pack_canonical(pr, s);
1315}
1316
1317float64 float64_round_to_int(float64 a, float_status *s)
1318{
1319 FloatParts pa = float64_unpack_canonical(a, s);
1320 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1321 return float64_round_pack_canonical(pr, s);
1322}
1323
1324float64 float64_trunc_to_int(float64 a, float_status *s)
1325{
1326 FloatParts pa = float64_unpack_canonical(a, s);
1327 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1328 return float64_round_pack_canonical(pr, s);
1329}
1330
ab52f973
AB
1331/*
1332 * Returns the result of converting the floating-point value `a' to
1333 * the two's complement integer format. The conversion is performed
1334 * according to the IEC/IEEE Standard for Binary Floating-Point
1335 * Arithmetic---which means in particular that the conversion is
1336 * rounded according to the current rounding mode. If `a' is a NaN,
1337 * the largest positive integer is returned. Otherwise, if the
1338 * conversion overflows, the largest integer with the same sign as `a'
1339 * is returned.
1340*/
1341
1342static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1343 int64_t min, int64_t max,
1344 float_status *s)
1345{
1346 uint64_t r;
1347 int orig_flags = get_float_exception_flags(s);
1348 FloatParts p = round_to_int(in, rmode, s);
1349
1350 switch (p.cls) {
1351 case float_class_snan:
1352 case float_class_qnan:
cffad426
SR
1353 case float_class_dnan:
1354 case float_class_msnan:
801bc563 1355 s->float_exception_flags = orig_flags | float_flag_invalid;
ab52f973
AB
1356 return max;
1357 case float_class_inf:
801bc563 1358 s->float_exception_flags = orig_flags | float_flag_invalid;
ab52f973
AB
1359 return p.sign ? min : max;
1360 case float_class_zero:
1361 return 0;
1362 case float_class_normal:
1363 if (p.exp < DECOMPOSED_BINARY_POINT) {
1364 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1365 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1366 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1367 } else {
1368 r = UINT64_MAX;
1369 }
1370 if (p.sign) {
1371 if (r < -(uint64_t) min) {
1372 return -r;
1373 } else {
1374 s->float_exception_flags = orig_flags | float_flag_invalid;
1375 return min;
1376 }
1377 } else {
1378 if (r < max) {
1379 return r;
1380 } else {
1381 s->float_exception_flags = orig_flags | float_flag_invalid;
1382 return max;
1383 }
1384 }
1385 default:
1386 g_assert_not_reached();
1387 }
1388}
1389
1390#define FLOAT_TO_INT(fsz, isz) \
1391int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1392 float_status *s) \
1393{ \
1394 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1395 return round_to_int_and_pack(p, s->float_rounding_mode, \
1396 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1397 s); \
1398} \
1399 \
1400int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1401 (float ## fsz a, float_status *s) \
1402{ \
1403 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1404 return round_to_int_and_pack(p, float_round_to_zero, \
1405 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1406 s); \
1407}
1408
1409FLOAT_TO_INT(16, 16)
1410FLOAT_TO_INT(16, 32)
1411FLOAT_TO_INT(16, 64)
1412
1413FLOAT_TO_INT(32, 16)
1414FLOAT_TO_INT(32, 32)
1415FLOAT_TO_INT(32, 64)
1416
1417FLOAT_TO_INT(64, 16)
1418FLOAT_TO_INT(64, 32)
1419FLOAT_TO_INT(64, 64)
1420
1421#undef FLOAT_TO_INT
1422
1423/*
1424 * Returns the result of converting the floating-point value `a' to
1425 * the unsigned integer format. The conversion is performed according
1426 * to the IEC/IEEE Standard for Binary Floating-Point
1427 * Arithmetic---which means in particular that the conversion is
1428 * rounded according to the current rounding mode. If `a' is a NaN,
1429 * the largest unsigned integer is returned. Otherwise, if the
1430 * conversion overflows, the largest unsigned integer is returned. If
1431 * the 'a' is negative, the result is rounded and zero is returned;
1432 * values that do not round to zero will raise the inexact exception
1433 * flag.
1434 */
1435
1436static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1437 float_status *s)
1438{
1439 int orig_flags = get_float_exception_flags(s);
1440 FloatParts p = round_to_int(in, rmode, s);
1441
1442 switch (p.cls) {
1443 case float_class_snan:
1444 case float_class_qnan:
cffad426
SR
1445 case float_class_dnan:
1446 case float_class_msnan:
ab52f973
AB
1447 s->float_exception_flags = orig_flags | float_flag_invalid;
1448 return max;
1449 case float_class_inf:
801bc563 1450 s->float_exception_flags = orig_flags | float_flag_invalid;
ab52f973
AB
1451 return p.sign ? 0 : max;
1452 case float_class_zero:
1453 return 0;
1454 case float_class_normal:
1455 {
1456 uint64_t r;
1457 if (p.sign) {
1458 s->float_exception_flags = orig_flags | float_flag_invalid;
1459 return 0;
1460 }
1461
1462 if (p.exp < DECOMPOSED_BINARY_POINT) {
1463 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1464 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1465 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1466 } else {
1467 s->float_exception_flags = orig_flags | float_flag_invalid;
1468 return max;
1469 }
1470
1471 /* For uint64 this will never trip, but if p.exp is too large
1472 * to shift a decomposed fraction we shall have exited via the
1473 * 3rd leg above.
1474 */
1475 if (r > max) {
1476 s->float_exception_flags = orig_flags | float_flag_invalid;
1477 return max;
1478 } else {
1479 return r;
1480 }
1481 }
1482 default:
1483 g_assert_not_reached();
1484 }
1485}
1486
1487#define FLOAT_TO_UINT(fsz, isz) \
1488uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1489 float_status *s) \
1490{ \
1491 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1492 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1493 UINT ## isz ## _MAX, s); \
1494} \
1495 \
1496uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1497 (float ## fsz a, float_status *s) \
1498{ \
1499 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
bd49e602
RH
1500 return round_to_uint_and_pack(p, float_round_to_zero, \
1501 UINT ## isz ## _MAX, s); \
ab52f973
AB
1502}
1503
1504FLOAT_TO_UINT(16, 16)
1505FLOAT_TO_UINT(16, 32)
1506FLOAT_TO_UINT(16, 64)
1507
1508FLOAT_TO_UINT(32, 16)
1509FLOAT_TO_UINT(32, 32)
1510FLOAT_TO_UINT(32, 64)
1511
1512FLOAT_TO_UINT(64, 16)
1513FLOAT_TO_UINT(64, 32)
1514FLOAT_TO_UINT(64, 64)
1515
1516#undef FLOAT_TO_UINT
1517
c02e1fb8
AB
1518/*
1519 * Integer to float conversions
1520 *
1521 * Returns the result of converting the two's complement integer `a'
1522 * to the floating-point format. The conversion is performed according
1523 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1524 */
1525
1526static FloatParts int_to_float(int64_t a, float_status *status)
1527{
1528 FloatParts r;
1529 if (a == 0) {
1530 r.cls = float_class_zero;
1531 r.sign = false;
1532 } else if (a == (1ULL << 63)) {
1533 r.cls = float_class_normal;
1534 r.sign = true;
1535 r.frac = DECOMPOSED_IMPLICIT_BIT;
1536 r.exp = 63;
1537 } else {
1538 uint64_t f;
1539 if (a < 0) {
1540 f = -a;
1541 r.sign = true;
1542 } else {
1543 f = a;
1544 r.sign = false;
1545 }
1546 int shift = clz64(f) - 1;
1547 r.cls = float_class_normal;
1548 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1549 r.frac = f << shift;
1550 }
1551
1552 return r;
1553}
1554
1555float16 int64_to_float16(int64_t a, float_status *status)
1556{
1557 FloatParts pa = int_to_float(a, status);
1558 return float16_round_pack_canonical(pa, status);
1559}
1560
1561float16 int32_to_float16(int32_t a, float_status *status)
1562{
1563 return int64_to_float16(a, status);
1564}
1565
1566float16 int16_to_float16(int16_t a, float_status *status)
1567{
1568 return int64_to_float16(a, status);
1569}
1570
1571float32 int64_to_float32(int64_t a, float_status *status)
1572{
1573 FloatParts pa = int_to_float(a, status);
1574 return float32_round_pack_canonical(pa, status);
1575}
1576
1577float32 int32_to_float32(int32_t a, float_status *status)
1578{
1579 return int64_to_float32(a, status);
1580}
1581
1582float32 int16_to_float32(int16_t a, float_status *status)
1583{
1584 return int64_to_float32(a, status);
1585}
1586
1587float64 int64_to_float64(int64_t a, float_status *status)
1588{
1589 FloatParts pa = int_to_float(a, status);
1590 return float64_round_pack_canonical(pa, status);
1591}
1592
1593float64 int32_to_float64(int32_t a, float_status *status)
1594{
1595 return int64_to_float64(a, status);
1596}
1597
1598float64 int16_to_float64(int16_t a, float_status *status)
1599{
1600 return int64_to_float64(a, status);
1601}
1602
1603
1604/*
1605 * Unsigned Integer to float conversions
1606 *
1607 * Returns the result of converting the unsigned integer `a' to the
1608 * floating-point format. The conversion is performed according to the
1609 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1610 */
1611
1612static FloatParts uint_to_float(uint64_t a, float_status *status)
1613{
1614 FloatParts r = { .sign = false};
1615
1616 if (a == 0) {
1617 r.cls = float_class_zero;
1618 } else {
1619 int spare_bits = clz64(a) - 1;
1620 r.cls = float_class_normal;
1621 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1622 if (spare_bits < 0) {
1623 shift64RightJamming(a, -spare_bits, &a);
1624 r.frac = a;
1625 } else {
1626 r.frac = a << spare_bits;
1627 }
1628 }
1629
1630 return r;
1631}
1632
1633float16 uint64_to_float16(uint64_t a, float_status *status)
1634{
1635 FloatParts pa = uint_to_float(a, status);
1636 return float16_round_pack_canonical(pa, status);
1637}
1638
1639float16 uint32_to_float16(uint32_t a, float_status *status)
1640{
1641 return uint64_to_float16(a, status);
1642}
1643
1644float16 uint16_to_float16(uint16_t a, float_status *status)
1645{
1646 return uint64_to_float16(a, status);
1647}
1648
1649float32 uint64_to_float32(uint64_t a, float_status *status)
1650{
1651 FloatParts pa = uint_to_float(a, status);
1652 return float32_round_pack_canonical(pa, status);
1653}
1654
1655float32 uint32_to_float32(uint32_t a, float_status *status)
1656{
1657 return uint64_to_float32(a, status);
1658}
1659
1660float32 uint16_to_float32(uint16_t a, float_status *status)
1661{
1662 return uint64_to_float32(a, status);
1663}
1664
1665float64 uint64_to_float64(uint64_t a, float_status *status)
1666{
1667 FloatParts pa = uint_to_float(a, status);
1668 return float64_round_pack_canonical(pa, status);
1669}
1670
1671float64 uint32_to_float64(uint32_t a, float_status *status)
1672{
1673 return uint64_to_float64(a, status);
1674}
1675
1676float64 uint16_to_float64(uint16_t a, float_status *status)
1677{
1678 return uint64_to_float64(a, status);
1679}
1680
89360067
AB
1681/* Float Min/Max */
1682/* min() and max() functions. These can't be implemented as
1683 * 'compare and pick one input' because that would mishandle
1684 * NaNs and +0 vs -0.
1685 *
1686 * minnum() and maxnum() functions. These are similar to the min()
1687 * and max() functions but if one of the arguments is a QNaN and
1688 * the other is numerical then the numerical argument is returned.
1689 * SNaNs will get quietened before being returned.
1690 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1691 * and maxNum() operations. min() and max() are the typical min/max
1692 * semantics provided by many CPUs which predate that specification.
1693 *
1694 * minnummag() and maxnummag() functions correspond to minNumMag()
1695 * and minNumMag() from the IEEE-754 2008.
1696 */
1697static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1698 bool ieee, bool ismag, float_status *s)
1699{
1700 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1701 if (ieee) {
1702 /* Takes two floating-point values `a' and `b', one of
1703 * which is a NaN, and returns the appropriate NaN
1704 * result. If either `a' or `b' is a signaling NaN,
1705 * the invalid exception is raised.
1706 */
1707 if (is_snan(a.cls) || is_snan(b.cls)) {
1708 return pick_nan(a, b, s);
1709 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1710 return b;
1711 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1712 return a;
1713 }
1714 }
1715 return pick_nan(a, b, s);
1716 } else {
1717 int a_exp, b_exp;
89360067
AB
1718
1719 switch (a.cls) {
1720 case float_class_normal:
1721 a_exp = a.exp;
1722 break;
1723 case float_class_inf:
1724 a_exp = INT_MAX;
1725 break;
1726 case float_class_zero:
1727 a_exp = INT_MIN;
1728 break;
1729 default:
1730 g_assert_not_reached();
1731 break;
1732 }
1733 switch (b.cls) {
1734 case float_class_normal:
1735 b_exp = b.exp;
1736 break;
1737 case float_class_inf:
1738 b_exp = INT_MAX;
1739 break;
1740 case float_class_zero:
1741 b_exp = INT_MIN;
1742 break;
1743 default:
1744 g_assert_not_reached();
1745 break;
1746 }
1747
6245327a
EC
1748 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1749 bool a_less = a_exp < b_exp;
1750 if (a_exp == b_exp) {
1751 a_less = a.frac < b.frac;
1752 }
1753 return a_less ^ ismin ? b : a;
89360067
AB
1754 }
1755
6245327a 1756 if (a.sign == b.sign) {
89360067
AB
1757 bool a_less = a_exp < b_exp;
1758 if (a_exp == b_exp) {
1759 a_less = a.frac < b.frac;
1760 }
6245327a 1761 return a.sign ^ a_less ^ ismin ? b : a;
89360067 1762 } else {
6245327a 1763 return a.sign ^ ismin ? b : a;
89360067
AB
1764 }
1765 }
1766}
1767
1768#define MINMAX(sz, name, ismin, isiee, ismag) \
1769float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1770 float_status *s) \
1771{ \
1772 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1773 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1774 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1775 \
1776 return float ## sz ## _round_pack_canonical(pr, s); \
1777}
1778
1779MINMAX(16, min, true, false, false)
1780MINMAX(16, minnum, true, true, false)
1781MINMAX(16, minnummag, true, true, true)
1782MINMAX(16, max, false, false, false)
1783MINMAX(16, maxnum, false, true, false)
1784MINMAX(16, maxnummag, false, true, true)
1785
1786MINMAX(32, min, true, false, false)
1787MINMAX(32, minnum, true, true, false)
1788MINMAX(32, minnummag, true, true, true)
1789MINMAX(32, max, false, false, false)
1790MINMAX(32, maxnum, false, true, false)
1791MINMAX(32, maxnummag, false, true, true)
1792
1793MINMAX(64, min, true, false, false)
1794MINMAX(64, minnum, true, true, false)
1795MINMAX(64, minnummag, true, true, true)
1796MINMAX(64, max, false, false, false)
1797MINMAX(64, maxnum, false, true, false)
1798MINMAX(64, maxnummag, false, true, true)
1799
1800#undef MINMAX
1801
0c4c9092
AB
1802/* Floating point compare */
1803static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1804 float_status *s)
1805{
1806 if (is_nan(a.cls) || is_nan(b.cls)) {
1807 if (!is_quiet ||
1808 a.cls == float_class_snan ||
1809 b.cls == float_class_snan) {
1810 s->float_exception_flags |= float_flag_invalid;
1811 }
1812 return float_relation_unordered;
1813 }
1814
1815 if (a.cls == float_class_zero) {
1816 if (b.cls == float_class_zero) {
1817 return float_relation_equal;
1818 }
1819 return b.sign ? float_relation_greater : float_relation_less;
1820 } else if (b.cls == float_class_zero) {
1821 return a.sign ? float_relation_less : float_relation_greater;
1822 }
1823
1824 /* The only really important thing about infinity is its sign. If
1825 * both are infinities the sign marks the smallest of the two.
1826 */
1827 if (a.cls == float_class_inf) {
1828 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1829 return float_relation_equal;
1830 }
1831 return a.sign ? float_relation_less : float_relation_greater;
1832 } else if (b.cls == float_class_inf) {
1833 return b.sign ? float_relation_greater : float_relation_less;
1834 }
1835
1836 if (a.sign != b.sign) {
1837 return a.sign ? float_relation_less : float_relation_greater;
1838 }
1839
1840 if (a.exp == b.exp) {
1841 if (a.frac == b.frac) {
1842 return float_relation_equal;
1843 }
1844 if (a.sign) {
1845 return a.frac > b.frac ?
1846 float_relation_less : float_relation_greater;
1847 } else {
1848 return a.frac > b.frac ?
1849 float_relation_greater : float_relation_less;
1850 }
1851 } else {
1852 if (a.sign) {
1853 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1854 } else {
1855 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1856 }
1857 }
1858}
1859
1860#define COMPARE(sz) \
1861int float ## sz ## _compare(float ## sz a, float ## sz b, \
1862 float_status *s) \
1863{ \
1864 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1865 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1866 return compare_floats(pa, pb, false, s); \
1867} \
1868int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1869 float_status *s) \
1870{ \
1871 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1872 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1873 return compare_floats(pa, pb, true, s); \
1874}
1875
1876COMPARE(16)
1877COMPARE(32)
1878COMPARE(64)
1879
1880#undef COMPARE
1881
0bfc9f19
AB
1882/* Multiply A by 2 raised to the power N. */
1883static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1884{
1885 if (unlikely(is_nan(a.cls))) {
1886 return return_nan(a, s);
1887 }
1888 if (a.cls == float_class_normal) {
ce8d4082
RH
1889 /* The largest float type (even though not supported by FloatParts)
1890 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1891 * still allows rounding to infinity, without allowing overflow
1892 * within the int32_t that backs FloatParts.exp.
1893 */
1894 n = MIN(MAX(n, -0x10000), 0x10000);
0bfc9f19
AB
1895 a.exp += n;
1896 }
1897 return a;
1898}
1899
1900float16 float16_scalbn(float16 a, int n, float_status *status)
1901{
1902 FloatParts pa = float16_unpack_canonical(a, status);
1903 FloatParts pr = scalbn_decomposed(pa, n, status);
1904 return float16_round_pack_canonical(pr, status);
1905}
1906
1907float32 float32_scalbn(float32 a, int n, float_status *status)
1908{
1909 FloatParts pa = float32_unpack_canonical(a, status);
1910 FloatParts pr = scalbn_decomposed(pa, n, status);
1911 return float32_round_pack_canonical(pr, status);
1912}
1913
1914float64 float64_scalbn(float64 a, int n, float_status *status)
1915{
1916 FloatParts pa = float64_unpack_canonical(a, status);
1917 FloatParts pr = scalbn_decomposed(pa, n, status);
1918 return float64_round_pack_canonical(pr, status);
1919}
1920
c13bb2da
AB
1921/*
1922 * Square Root
1923 *
1924 * The old softfloat code did an approximation step before zeroing in
1925 * on the final result. However for simpleness we just compute the
1926 * square root by iterating down from the implicit bit to enough extra
1927 * bits to ensure we get a correctly rounded result.
1928 *
1929 * This does mean however the calculation is slower than before,
1930 * especially for 64 bit floats.
1931 */
1932
1933static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1934{
1935 uint64_t a_frac, r_frac, s_frac;
1936 int bit, last_bit;
1937
1938 if (is_nan(a.cls)) {
1939 return return_nan(a, s);
1940 }
1941 if (a.cls == float_class_zero) {
1942 return a; /* sqrt(+-0) = +-0 */
1943 }
1944 if (a.sign) {
1945 s->float_exception_flags |= float_flag_invalid;
1946 a.cls = float_class_dnan;
1947 return a;
1948 }
1949 if (a.cls == float_class_inf) {
1950 return a; /* sqrt(+inf) = +inf */
1951 }
1952
1953 assert(a.cls == float_class_normal);
1954
1955 /* We need two overflow bits at the top. Adding room for that is a
1956 * right shift. If the exponent is odd, we can discard the low bit
1957 * by multiplying the fraction by 2; that's a left shift. Combine
1958 * those and we shift right if the exponent is even.
1959 */
1960 a_frac = a.frac;
1961 if (!(a.exp & 1)) {
1962 a_frac >>= 1;
1963 }
1964 a.exp >>= 1;
1965
1966 /* Bit-by-bit computation of sqrt. */
1967 r_frac = 0;
1968 s_frac = 0;
1969
1970 /* Iterate from implicit bit down to the 3 extra bits to compute a
1971 * properly rounded result. Remember we've inserted one more bit
1972 * at the top, so these positions are one less.
1973 */
1974 bit = DECOMPOSED_BINARY_POINT - 1;
1975 last_bit = MAX(p->frac_shift - 4, 0);
1976 do {
1977 uint64_t q = 1ULL << bit;
1978 uint64_t t_frac = s_frac + q;
1979 if (t_frac <= a_frac) {
1980 s_frac = t_frac + q;
1981 a_frac -= t_frac;
1982 r_frac += q;
1983 }
1984 a_frac <<= 1;
1985 } while (--bit >= last_bit);
1986
1987 /* Undo the right shift done above. If there is any remaining
1988 * fraction, the result is inexact. Set the sticky bit.
1989 */
1990 a.frac = (r_frac << 1) + (a_frac != 0);
1991
1992 return a;
1993}
1994
1995float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1996{
1997 FloatParts pa = float16_unpack_canonical(a, status);
1998 FloatParts pr = sqrt_float(pa, status, &float16_params);
1999 return float16_round_pack_canonical(pr, status);
2000}
2001
2002float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2003{
2004 FloatParts pa = float32_unpack_canonical(a, status);
2005 FloatParts pr = sqrt_float(pa, status, &float32_params);
2006 return float32_round_pack_canonical(pr, status);
2007}
2008
2009float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2010{
2011 FloatParts pa = float64_unpack_canonical(a, status);
2012 FloatParts pr = sqrt_float(pa, status, &float64_params);
2013 return float64_round_pack_canonical(pr, status);
2014}
2015
2016
158142c2
FB
2017/*----------------------------------------------------------------------------
2018| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2019| and 7, and returns the properly rounded 32-bit integer corresponding to the
2020| input. If `zSign' is 1, the input is negated before being converted to an
2021| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2022| is simply rounded to an integer, with the inexact exception raised if the
2023| input cannot be represented exactly as an integer. However, if the fixed-
2024| point input is too large, the invalid exception is raised and the largest
2025| positive or negative integer is returned.
2026*----------------------------------------------------------------------------*/
2027
f4014512 2028static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2 2029{
8f506c70 2030 int8_t roundingMode;
158142c2 2031 flag roundNearestEven;
8f506c70 2032 int8_t roundIncrement, roundBits;
760e1416 2033 int32_t z;
158142c2 2034
a2f2d288 2035 roundingMode = status->float_rounding_mode;
158142c2 2036 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2037 switch (roundingMode) {
2038 case float_round_nearest_even:
f9288a76 2039 case float_round_ties_away:
dc355b76
PM
2040 roundIncrement = 0x40;
2041 break;
2042 case float_round_to_zero:
2043 roundIncrement = 0;
2044 break;
2045 case float_round_up:
2046 roundIncrement = zSign ? 0 : 0x7f;
2047 break;
2048 case float_round_down:
2049 roundIncrement = zSign ? 0x7f : 0;
2050 break;
2051 default:
2052 abort();
158142c2
FB
2053 }
2054 roundBits = absZ & 0x7F;
2055 absZ = ( absZ + roundIncrement )>>7;
2056 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2057 z = absZ;
2058 if ( zSign ) z = - z;
2059 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 2060 float_raise(float_flag_invalid, status);
bb98fe42 2061 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2 2062 }
a2f2d288
PM
2063 if (roundBits) {
2064 status->float_exception_flags |= float_flag_inexact;
2065 }
158142c2
FB
2066 return z;
2067
2068}
2069
2070/*----------------------------------------------------------------------------
2071| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2072| `absZ1', with binary point between bits 63 and 64 (between the input words),
2073| and returns the properly rounded 64-bit integer corresponding to the input.
2074| If `zSign' is 1, the input is negated before being converted to an integer.
2075| Ordinarily, the fixed-point input is simply rounded to an integer, with
2076| the inexact exception raised if the input cannot be represented exactly as
2077| an integer. However, if the fixed-point input is too large, the invalid
2078| exception is raised and the largest positive or negative integer is
2079| returned.
2080*----------------------------------------------------------------------------*/
2081
f42c2224 2082static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
e5a41ffa 2083 float_status *status)
158142c2 2084{
8f506c70 2085 int8_t roundingMode;
158142c2 2086 flag roundNearestEven, increment;
760e1416 2087 int64_t z;
158142c2 2088
a2f2d288 2089 roundingMode = status->float_rounding_mode;
158142c2 2090 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2091 switch (roundingMode) {
2092 case float_round_nearest_even:
f9288a76 2093 case float_round_ties_away:
dc355b76
PM
2094 increment = ((int64_t) absZ1 < 0);
2095 break;
2096 case float_round_to_zero:
2097 increment = 0;
2098 break;
2099 case float_round_up:
2100 increment = !zSign && absZ1;
2101 break;
2102 case float_round_down:
2103 increment = zSign && absZ1;
2104 break;
2105 default:
2106 abort();
158142c2
FB
2107 }
2108 if ( increment ) {
2109 ++absZ0;
2110 if ( absZ0 == 0 ) goto overflow;
bb98fe42 2111 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
2112 }
2113 z = absZ0;
2114 if ( zSign ) z = - z;
2115 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2116 overflow:
ff32e16e 2117 float_raise(float_flag_invalid, status);
158142c2 2118 return
bb98fe42 2119 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
2120 : LIT64( 0x7FFFFFFFFFFFFFFF );
2121 }
a2f2d288
PM
2122 if (absZ1) {
2123 status->float_exception_flags |= float_flag_inexact;
2124 }
158142c2
FB
2125 return z;
2126
2127}
2128
fb3ea83a
TM
2129/*----------------------------------------------------------------------------
2130| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2131| `absZ1', with binary point between bits 63 and 64 (between the input words),
2132| and returns the properly rounded 64-bit unsigned integer corresponding to the
2133| input. Ordinarily, the fixed-point input is simply rounded to an integer,
2134| with the inexact exception raised if the input cannot be represented exactly
2135| as an integer. However, if the fixed-point input is too large, the invalid
2136| exception is raised and the largest unsigned integer is returned.
2137*----------------------------------------------------------------------------*/
2138
f42c2224 2139static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 2140 uint64_t absZ1, float_status *status)
fb3ea83a 2141{
8f506c70 2142 int8_t roundingMode;
fb3ea83a
TM
2143 flag roundNearestEven, increment;
2144
a2f2d288 2145 roundingMode = status->float_rounding_mode;
fb3ea83a 2146 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
2147 switch (roundingMode) {
2148 case float_round_nearest_even:
f9288a76 2149 case float_round_ties_away:
dc355b76
PM
2150 increment = ((int64_t)absZ1 < 0);
2151 break;
2152 case float_round_to_zero:
2153 increment = 0;
2154 break;
2155 case float_round_up:
2156 increment = !zSign && absZ1;
2157 break;
2158 case float_round_down:
2159 increment = zSign && absZ1;
2160 break;
2161 default:
2162 abort();
fb3ea83a
TM
2163 }
2164 if (increment) {
2165 ++absZ0;
2166 if (absZ0 == 0) {
ff32e16e 2167 float_raise(float_flag_invalid, status);
fb3ea83a
TM
2168 return LIT64(0xFFFFFFFFFFFFFFFF);
2169 }
2170 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2171 }
2172
2173 if (zSign && absZ0) {
ff32e16e 2174 float_raise(float_flag_invalid, status);
fb3ea83a
TM
2175 return 0;
2176 }
2177
2178 if (absZ1) {
a2f2d288 2179 status->float_exception_flags |= float_flag_inexact;
fb3ea83a
TM
2180 }
2181 return absZ0;
2182}
2183
37d18660
PM
2184/*----------------------------------------------------------------------------
2185| If `a' is denormal and we are in flush-to-zero mode then set the
2186| input-denormal exception and return zero. Otherwise just return the value.
2187*----------------------------------------------------------------------------*/
e5a41ffa 2188float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660 2189{
a2f2d288 2190 if (status->flush_inputs_to_zero) {
37d18660 2191 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 2192 float_raise(float_flag_input_denormal, status);
37d18660
PM
2193 return make_float32(float32_val(a) & 0x80000000);
2194 }
2195 }
2196 return a;
2197}
2198
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig': it is shifted
| left by (leading-zero count - 8) and stored through `zSigPtr', and the
| matching exponent, 1 minus that shift, is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
2216
158142c2
FB
2217/*----------------------------------------------------------------------------
2218| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2219| and significand `zSig', and returns the proper single-precision floating-
2220| point value corresponding to the abstract input. Ordinarily, the abstract
2221| value is simply rounded and packed into the single-precision format, with
2222| the inexact exception raised if the abstract input cannot be represented
2223| exactly. However, if the abstract value is too large, the overflow and
2224| inexact exceptions are raised and an infinity or maximal finite value is
2225| returned. If the abstract value is too small, the input value is rounded to
2226| a subnormal number, and the underflow and inexact exceptions are raised if
2227| the abstract input cannot be represented exactly as a subnormal single-
2228| precision floating-point number.
2229| The input significand `zSig' has its binary point between bits 30
2230| and 29, which is 7 bits to the left of the usual location. This shifted
2231| significand must be normalized or smaller. If `zSig' is not normalized,
2232| `zExp' must be 0; in that case, the result returned is a subnormal number,
2233| and it must not require rounding. In the usual case that `zSig' is
2234| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2235| The handling of underflow and overflow follows the IEC/IEEE Standard for
2236| Binary Floating-Point Arithmetic.
2237*----------------------------------------------------------------------------*/
2238
0c48262d 2239static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 2240 float_status *status)
158142c2 2241{
8f506c70 2242 int8_t roundingMode;
158142c2 2243 flag roundNearestEven;
8f506c70 2244 int8_t roundIncrement, roundBits;
158142c2
FB
2245 flag isTiny;
2246
a2f2d288 2247 roundingMode = status->float_rounding_mode;
158142c2 2248 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2249 switch (roundingMode) {
2250 case float_round_nearest_even:
f9288a76 2251 case float_round_ties_away:
dc355b76
PM
2252 roundIncrement = 0x40;
2253 break;
2254 case float_round_to_zero:
2255 roundIncrement = 0;
2256 break;
2257 case float_round_up:
2258 roundIncrement = zSign ? 0 : 0x7f;
2259 break;
2260 case float_round_down:
2261 roundIncrement = zSign ? 0x7f : 0;
2262 break;
2263 default:
2264 abort();
2265 break;
158142c2
FB
2266 }
2267 roundBits = zSig & 0x7F;
bb98fe42 2268 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
2269 if ( ( 0xFD < zExp )
2270 || ( ( zExp == 0xFD )
bb98fe42 2271 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 2272 ) {
ff32e16e 2273 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 2274 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
2275 }
2276 if ( zExp < 0 ) {
a2f2d288 2277 if (status->flush_to_zero) {
ff32e16e 2278 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2279 return packFloat32(zSign, 0, 0);
2280 }
158142c2 2281 isTiny =
a2f2d288
PM
2282 (status->float_detect_tininess
2283 == float_tininess_before_rounding)
158142c2
FB
2284 || ( zExp < -1 )
2285 || ( zSig + roundIncrement < 0x80000000 );
2286 shift32RightJamming( zSig, - zExp, &zSig );
2287 zExp = 0;
2288 roundBits = zSig & 0x7F;
ff32e16e
PM
2289 if (isTiny && roundBits) {
2290 float_raise(float_flag_underflow, status);
2291 }
158142c2
FB
2292 }
2293 }
a2f2d288
PM
2294 if (roundBits) {
2295 status->float_exception_flags |= float_flag_inexact;
2296 }
158142c2
FB
2297 zSig = ( zSig + roundIncrement )>>7;
2298 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2299 if ( zSig == 0 ) zExp = 0;
2300 return packFloat32( zSign, zExp, zSig );
2301
2302}
2303
2304/*----------------------------------------------------------------------------
2305| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2306| and significand `zSig', and returns the proper single-precision floating-
2307| point value corresponding to the abstract input. This routine is just like
2308| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2309| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2310| floating-point exponent.
2311*----------------------------------------------------------------------------*/
2312
2313static float32
0c48262d 2314 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 2315 float_status *status)
158142c2 2316{
8f506c70 2317 int8_t shiftCount;
158142c2
FB
2318
2319 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
2320 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2321 status);
158142c2
FB
2322
2323}
2324
37d18660
PM
2325/*----------------------------------------------------------------------------
2326| If `a' is denormal and we are in flush-to-zero mode then set the
2327| input-denormal exception and return zero. Otherwise just return the value.
2328*----------------------------------------------------------------------------*/
e5a41ffa 2329float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660 2330{
a2f2d288 2331 if (status->flush_inputs_to_zero) {
37d18660 2332 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 2333 float_raise(float_flag_input_denormal, status);
37d18660
PM
2334 return make_float64(float64_val(a) & (1ULL << 63));
2335 }
2336 }
2337 return a;
2338}
2339
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig': it is shifted
| left by (leading-zero count - 11) and stored through `zSigPtr', and the
| matching exponent, 1 minus that shift, is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
2357
2358/*----------------------------------------------------------------------------
2359| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2360| double-precision floating-point value, returning the result. After being
2361| shifted into the proper positions, the three fields are simply added
2362| together to form the result. This means that any integer portion of `zSig'
2363| will be added into the exponent. Since a properly normalized significand
2364| will have an integer portion equal to 1, the `zExp' input should be 1 less
2365| than the desired result exponent whenever `zSig' is a complete, normalized
2366| significand.
2367*----------------------------------------------------------------------------*/
2368
0c48262d 2369static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
158142c2
FB
2370{
2371
f090c9d4 2372 return make_float64(
bb98fe42 2373 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
2374
2375}
2376
2377/*----------------------------------------------------------------------------
2378| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2379| and significand `zSig', and returns the proper double-precision floating-
2380| point value corresponding to the abstract input. Ordinarily, the abstract
2381| value is simply rounded and packed into the double-precision format, with
2382| the inexact exception raised if the abstract input cannot be represented
2383| exactly. However, if the abstract value is too large, the overflow and
2384| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
2385| returned. If the abstract value is too small, the input value is rounded to
2386| a subnormal number, and the underflow and inexact exceptions are raised if
2387| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
2388| precision floating-point number.
2389| The input significand `zSig' has its binary point between bits 62
2390| and 61, which is 10 bits to the left of the usual location. This shifted
2391| significand must be normalized or smaller. If `zSig' is not normalized,
2392| `zExp' must be 0; in that case, the result returned is a subnormal number,
2393| and it must not require rounding. In the usual case that `zSig' is
2394| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2395| The handling of underflow and overflow follows the IEC/IEEE Standard for
2396| Binary Floating-Point Arithmetic.
2397*----------------------------------------------------------------------------*/
2398
0c48262d 2399static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 2400 float_status *status)
158142c2 2401{
8f506c70 2402 int8_t roundingMode;
158142c2 2403 flag roundNearestEven;
0c48262d 2404 int roundIncrement, roundBits;
158142c2
FB
2405 flag isTiny;
2406
a2f2d288 2407 roundingMode = status->float_rounding_mode;
158142c2 2408 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2409 switch (roundingMode) {
2410 case float_round_nearest_even:
f9288a76 2411 case float_round_ties_away:
dc355b76
PM
2412 roundIncrement = 0x200;
2413 break;
2414 case float_round_to_zero:
2415 roundIncrement = 0;
2416 break;
2417 case float_round_up:
2418 roundIncrement = zSign ? 0 : 0x3ff;
2419 break;
2420 case float_round_down:
2421 roundIncrement = zSign ? 0x3ff : 0;
2422 break;
9ee6f678
BR
2423 case float_round_to_odd:
2424 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2425 break;
dc355b76
PM
2426 default:
2427 abort();
158142c2
FB
2428 }
2429 roundBits = zSig & 0x3FF;
bb98fe42 2430 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
2431 if ( ( 0x7FD < zExp )
2432 || ( ( zExp == 0x7FD )
bb98fe42 2433 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 2434 ) {
9ee6f678
BR
2435 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2436 roundIncrement != 0;
ff32e16e 2437 float_raise(float_flag_overflow | float_flag_inexact, status);
9ee6f678 2438 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
158142c2
FB
2439 }
2440 if ( zExp < 0 ) {
a2f2d288 2441 if (status->flush_to_zero) {
ff32e16e 2442 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2443 return packFloat64(zSign, 0, 0);
2444 }
158142c2 2445 isTiny =
a2f2d288
PM
2446 (status->float_detect_tininess
2447 == float_tininess_before_rounding)
158142c2
FB
2448 || ( zExp < -1 )
2449 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2450 shift64RightJamming( zSig, - zExp, &zSig );
2451 zExp = 0;
2452 roundBits = zSig & 0x3FF;
ff32e16e
PM
2453 if (isTiny && roundBits) {
2454 float_raise(float_flag_underflow, status);
2455 }
9ee6f678
BR
2456 if (roundingMode == float_round_to_odd) {
2457 /*
2458 * For round-to-odd case, the roundIncrement depends on
2459 * zSig which just changed.
2460 */
2461 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2462 }
158142c2
FB
2463 }
2464 }
a2f2d288
PM
2465 if (roundBits) {
2466 status->float_exception_flags |= float_flag_inexact;
2467 }
158142c2
FB
2468 zSig = ( zSig + roundIncrement )>>10;
2469 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2470 if ( zSig == 0 ) zExp = 0;
2471 return packFloat64( zSign, zExp, zSig );
2472
2473}
2474
2475/*----------------------------------------------------------------------------
2476| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2477| and significand `zSig', and returns the proper double-precision floating-
2478| point value corresponding to the abstract input. This routine is just like
2479| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2480| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2481| floating-point exponent.
2482*----------------------------------------------------------------------------*/
2483
2484static float64
0c48262d 2485 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 2486 float_status *status)
158142c2 2487{
8f506c70 2488 int8_t shiftCount;
158142c2
FB
2489
2490 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
2491 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2492 status);
158142c2
FB
2493
2494}
2495
158142c2
FB
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision significand `aSig': it
| is shifted left by its leading-zero count and stored through `zSigPtr',
| and the matching exponent, 1 minus that shift, is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
2512
2513/*----------------------------------------------------------------------------
2514| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2515| and extended significand formed by the concatenation of `zSig0' and `zSig1',
2516| and returns the proper extended double-precision floating-point value
2517| corresponding to the abstract input. Ordinarily, the abstract value is
2518| rounded and packed into the extended double-precision format, with the
2519| inexact exception raised if the abstract input cannot be represented
2520| exactly. However, if the abstract value is too large, the overflow and
2521| inexact exceptions are raised and an infinity or maximal finite value is
2522| returned. If the abstract value is too small, the input value is rounded to
2523| a subnormal number, and the underflow and inexact exceptions are raised if
2524| the abstract input cannot be represented exactly as a subnormal extended
2525| double-precision floating-point number.
2526| If `roundingPrecision' is 32 or 64, the result is rounded to the same
2527| number of bits as single or double precision, respectively. Otherwise, the
2528| result is rounded to the full precision of the extended double-precision
2529| format.
2530| The input significand must be normalized or smaller. If the input
2531| significand is not normalized, `zExp' must be 0; in that case, the result
2532| returned is a subnormal number, and it must not require rounding. The
2533| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2534| Floating-Point Arithmetic.
2535*----------------------------------------------------------------------------*/
2536
88857aca
LV
2537floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2538 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2539 float_status *status)
158142c2 2540{
8f506c70 2541 int8_t roundingMode;
158142c2 2542 flag roundNearestEven, increment, isTiny;
f42c2224 2543 int64_t roundIncrement, roundMask, roundBits;
158142c2 2544
a2f2d288 2545 roundingMode = status->float_rounding_mode;
158142c2
FB
2546 roundNearestEven = ( roundingMode == float_round_nearest_even );
2547 if ( roundingPrecision == 80 ) goto precision80;
2548 if ( roundingPrecision == 64 ) {
2549 roundIncrement = LIT64( 0x0000000000000400 );
2550 roundMask = LIT64( 0x00000000000007FF );
2551 }
2552 else if ( roundingPrecision == 32 ) {
2553 roundIncrement = LIT64( 0x0000008000000000 );
2554 roundMask = LIT64( 0x000000FFFFFFFFFF );
2555 }
2556 else {
2557 goto precision80;
2558 }
2559 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
2560 switch (roundingMode) {
2561 case float_round_nearest_even:
f9288a76 2562 case float_round_ties_away:
dc355b76
PM
2563 break;
2564 case float_round_to_zero:
2565 roundIncrement = 0;
2566 break;
2567 case float_round_up:
2568 roundIncrement = zSign ? 0 : roundMask;
2569 break;
2570 case float_round_down:
2571 roundIncrement = zSign ? roundMask : 0;
2572 break;
2573 default:
2574 abort();
158142c2
FB
2575 }
2576 roundBits = zSig0 & roundMask;
bb98fe42 2577 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
2578 if ( ( 0x7FFE < zExp )
2579 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2580 ) {
2581 goto overflow;
2582 }
2583 if ( zExp <= 0 ) {
a2f2d288 2584 if (status->flush_to_zero) {
ff32e16e 2585 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2586 return packFloatx80(zSign, 0, 0);
2587 }
158142c2 2588 isTiny =
a2f2d288
PM
2589 (status->float_detect_tininess
2590 == float_tininess_before_rounding)
158142c2
FB
2591 || ( zExp < 0 )
2592 || ( zSig0 <= zSig0 + roundIncrement );
2593 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2594 zExp = 0;
2595 roundBits = zSig0 & roundMask;
ff32e16e
PM
2596 if (isTiny && roundBits) {
2597 float_raise(float_flag_underflow, status);
2598 }
a2f2d288
PM
2599 if (roundBits) {
2600 status->float_exception_flags |= float_flag_inexact;
2601 }
158142c2 2602 zSig0 += roundIncrement;
bb98fe42 2603 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
2604 roundIncrement = roundMask + 1;
2605 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2606 roundMask |= roundIncrement;
2607 }
2608 zSig0 &= ~ roundMask;
2609 return packFloatx80( zSign, zExp, zSig0 );
2610 }
2611 }
a2f2d288
PM
2612 if (roundBits) {
2613 status->float_exception_flags |= float_flag_inexact;
2614 }
158142c2
FB
2615 zSig0 += roundIncrement;
2616 if ( zSig0 < roundIncrement ) {
2617 ++zExp;
2618 zSig0 = LIT64( 0x8000000000000000 );
2619 }
2620 roundIncrement = roundMask + 1;
2621 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2622 roundMask |= roundIncrement;
2623 }
2624 zSig0 &= ~ roundMask;
2625 if ( zSig0 == 0 ) zExp = 0;
2626 return packFloatx80( zSign, zExp, zSig0 );
2627 precision80:
dc355b76
PM
2628 switch (roundingMode) {
2629 case float_round_nearest_even:
f9288a76 2630 case float_round_ties_away:
dc355b76
PM
2631 increment = ((int64_t)zSig1 < 0);
2632 break;
2633 case float_round_to_zero:
2634 increment = 0;
2635 break;
2636 case float_round_up:
2637 increment = !zSign && zSig1;
2638 break;
2639 case float_round_down:
2640 increment = zSign && zSig1;
2641 break;
2642 default:
2643 abort();
158142c2 2644 }
bb98fe42 2645 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
2646 if ( ( 0x7FFE < zExp )
2647 || ( ( zExp == 0x7FFE )
2648 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2649 && increment
2650 )
2651 ) {
2652 roundMask = 0;
2653 overflow:
ff32e16e 2654 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
2655 if ( ( roundingMode == float_round_to_zero )
2656 || ( zSign && ( roundingMode == float_round_up ) )
2657 || ( ! zSign && ( roundingMode == float_round_down ) )
2658 ) {
2659 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2660 }
0f605c88
LV
2661 return packFloatx80(zSign,
2662 floatx80_infinity_high,
2663 floatx80_infinity_low);
158142c2
FB
2664 }
2665 if ( zExp <= 0 ) {
2666 isTiny =
a2f2d288
PM
2667 (status->float_detect_tininess
2668 == float_tininess_before_rounding)
158142c2
FB
2669 || ( zExp < 0 )
2670 || ! increment
2671 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2672 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2673 zExp = 0;
ff32e16e
PM
2674 if (isTiny && zSig1) {
2675 float_raise(float_flag_underflow, status);
2676 }
a2f2d288
PM
2677 if (zSig1) {
2678 status->float_exception_flags |= float_flag_inexact;
2679 }
dc355b76
PM
2680 switch (roundingMode) {
2681 case float_round_nearest_even:
f9288a76 2682 case float_round_ties_away:
dc355b76
PM
2683 increment = ((int64_t)zSig1 < 0);
2684 break;
2685 case float_round_to_zero:
2686 increment = 0;
2687 break;
2688 case float_round_up:
2689 increment = !zSign && zSig1;
2690 break;
2691 case float_round_down:
2692 increment = zSign && zSig1;
2693 break;
2694 default:
2695 abort();
158142c2
FB
2696 }
2697 if ( increment ) {
2698 ++zSig0;
2699 zSig0 &=
bb98fe42
AF
2700 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2701 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
2702 }
2703 return packFloatx80( zSign, zExp, zSig0 );
2704 }
2705 }
a2f2d288
PM
2706 if (zSig1) {
2707 status->float_exception_flags |= float_flag_inexact;
2708 }
158142c2
FB
2709 if ( increment ) {
2710 ++zSig0;
2711 if ( zSig0 == 0 ) {
2712 ++zExp;
2713 zSig0 = LIT64( 0x8000000000000000 );
2714 }
2715 else {
bb98fe42 2716 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
2717 }
2718 }
2719 else {
2720 if ( zSig0 == 0 ) zExp = 0;
2721 }
2722 return packFloatx80( zSign, zExp, zSig0 );
2723
2724}
2725
2726/*----------------------------------------------------------------------------
2727| Takes an abstract floating-point value having sign `zSign', exponent
2728| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2729| and returns the proper extended double-precision floating-point value
2730| corresponding to the abstract input. This routine is just like
2731| `roundAndPackFloatx80' except that the input significand does not have to be
2732| normalized.
2733*----------------------------------------------------------------------------*/
2734
88857aca
LV
2735floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2736 flag zSign, int32_t zExp,
2737 uint64_t zSig0, uint64_t zSig1,
2738 float_status *status)
158142c2 2739{
8f506c70 2740 int8_t shiftCount;
158142c2
FB
2741
2742 if ( zSig0 == 0 ) {
2743 zSig0 = zSig1;
2744 zSig1 = 0;
2745 zExp -= 64;
2746 }
2747 shiftCount = countLeadingZeros64( zSig0 );
2748 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2749 zExp -= shiftCount;
ff32e16e
PM
2750 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2751 zSig0, zSig1, status);
158142c2
FB
2752
2753}
2754
158142c2
FB
2755/*----------------------------------------------------------------------------
2756| Returns the least-significant 64 fraction bits of the quadruple-precision
2757| floating-point value `a'.
2758*----------------------------------------------------------------------------*/
2759
a49db98d 2760static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
2761{
2762
2763 return a.low;
2764
2765}
2766
2767/*----------------------------------------------------------------------------
2768| Returns the most-significant 48 fraction bits of the quadruple-precision
2769| floating-point value `a'.
2770*----------------------------------------------------------------------------*/
2771
a49db98d 2772static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
2773{
2774
2775 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2776
2777}
2778
2779/*----------------------------------------------------------------------------
2780| Returns the exponent bits of the quadruple-precision floating-point value
2781| `a'.
2782*----------------------------------------------------------------------------*/
2783
f4014512 2784static inline int32_t extractFloat128Exp( float128 a )
158142c2
FB
2785{
2786
2787 return ( a.high>>48 ) & 0x7FFF;
2788
2789}
2790
2791/*----------------------------------------------------------------------------
2792| Returns the sign bit of the quadruple-precision floating-point value `a'.
2793*----------------------------------------------------------------------------*/
2794
a49db98d 2795static inline flag extractFloat128Sign( float128 a )
158142c2
FB
2796{
2797
2798 return a.high>>63;
2799
2800}
2801
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by the
| concatenation of `aSig0' and `aSig1'.  The normalized exponent is stored
| through `zExpPtr', the most significant 49 bits of the normalized
| significand through `zSig0Ptr', and the least significant 64 bits through
| `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading one is in the high word: a short 128-bit left shift. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty: the leading one must come from aSig1. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        /* All significant bits fit in the high result word. */
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: split aSig1 across both words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
2842
2843/*----------------------------------------------------------------------------
2844| Packs the sign `zSign', the exponent `zExp', and the significand formed
2845| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2846| floating-point value, returning the result. After being shifted into the
2847| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2848| added together to form the most significant 32 bits of the result. This
2849| means that any integer portion of `zSig0' will be added into the exponent.
2850| Since a properly normalized significand will have an integer portion equal
2851| to 1, the `zExp' input should be 1 less than the desired result exponent
2852| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2853| significand.
2854*----------------------------------------------------------------------------*/
2855
a49db98d 2856static inline float128
f4014512 2857 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
2858{
2859 float128 z;
2860
2861 z.low = zSig1;
bb98fe42 2862 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
2863 return z;
2864
2865}
2866
2867/*----------------------------------------------------------------------------
2868| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2869| and extended significand formed by the concatenation of `zSig0', `zSig1',
2870| and `zSig2', and returns the proper quadruple-precision floating-point value
2871| corresponding to the abstract input. Ordinarily, the abstract value is
2872| simply rounded and packed into the quadruple-precision format, with the
2873| inexact exception raised if the abstract input cannot be represented
2874| exactly. However, if the abstract value is too large, the overflow and
2875| inexact exceptions are raised and an infinity or maximal finite value is
2876| returned. If the abstract value is too small, the input value is rounded to
2877| a subnormal number, and the underflow and inexact exceptions are raised if
2878| the abstract input cannot be represented exactly as a subnormal quadruple-
2879| precision floating-point number.
2880| The input significand must be normalized or smaller. If the input
2881| significand is not normalized, `zExp' must be 0; in that case, the result
2882| returned is a subnormal number, and it must not require rounding. In the
2883| usual case that the input significand is normalized, `zExp' must be 1 less
2884| than the ``true'' floating-point exponent. The handling of underflow and
2885| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2886*----------------------------------------------------------------------------*/
2887
f4014512 2888static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
2889 uint64_t zSig0, uint64_t zSig1,
2890 uint64_t zSig2, float_status *status)
158142c2 2891{
8f506c70 2892 int8_t roundingMode;
158142c2
FB
2893 flag roundNearestEven, increment, isTiny;
2894
a2f2d288 2895 roundingMode = status->float_rounding_mode;
158142c2 2896 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2897 switch (roundingMode) {
2898 case float_round_nearest_even:
f9288a76 2899 case float_round_ties_away:
dc355b76
PM
2900 increment = ((int64_t)zSig2 < 0);
2901 break;
2902 case float_round_to_zero:
2903 increment = 0;
2904 break;
2905 case float_round_up:
2906 increment = !zSign && zSig2;
2907 break;
2908 case float_round_down:
2909 increment = zSign && zSig2;
2910 break;
9ee6f678
BR
2911 case float_round_to_odd:
2912 increment = !(zSig1 & 0x1) && zSig2;
2913 break;
dc355b76
PM
2914 default:
2915 abort();
158142c2 2916 }
bb98fe42 2917 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
2918 if ( ( 0x7FFD < zExp )
2919 || ( ( zExp == 0x7FFD )
2920 && eq128(
2921 LIT64( 0x0001FFFFFFFFFFFF ),
2922 LIT64( 0xFFFFFFFFFFFFFFFF ),
2923 zSig0,
2924 zSig1
2925 )
2926 && increment
2927 )
2928 ) {
ff32e16e 2929 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
2930 if ( ( roundingMode == float_round_to_zero )
2931 || ( zSign && ( roundingMode == float_round_up ) )
2932 || ( ! zSign && ( roundingMode == float_round_down ) )
9ee6f678 2933 || (roundingMode == float_round_to_odd)
158142c2
FB
2934 ) {
2935 return
2936 packFloat128(
2937 zSign,
2938 0x7FFE,
2939 LIT64( 0x0000FFFFFFFFFFFF ),
2940 LIT64( 0xFFFFFFFFFFFFFFFF )
2941 );
2942 }
2943 return packFloat128( zSign, 0x7FFF, 0, 0 );
2944 }
2945 if ( zExp < 0 ) {
a2f2d288 2946 if (status->flush_to_zero) {
ff32e16e 2947 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2948 return packFloat128(zSign, 0, 0, 0);
2949 }
158142c2 2950 isTiny =
a2f2d288
PM
2951 (status->float_detect_tininess
2952 == float_tininess_before_rounding)
158142c2
FB
2953 || ( zExp < -1 )
2954 || ! increment
2955 || lt128(
2956 zSig0,
2957 zSig1,
2958 LIT64( 0x0001FFFFFFFFFFFF ),
2959 LIT64( 0xFFFFFFFFFFFFFFFF )
2960 );
2961 shift128ExtraRightJamming(
2962 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2963 zExp = 0;
ff32e16e
PM
2964 if (isTiny && zSig2) {
2965 float_raise(float_flag_underflow, status);
2966 }
dc355b76
PM
2967 switch (roundingMode) {
2968 case float_round_nearest_even:
f9288a76 2969 case float_round_ties_away:
dc355b76
PM
2970 increment = ((int64_t)zSig2 < 0);
2971 break;
2972 case float_round_to_zero:
2973 increment = 0;
2974 break;
2975 case float_round_up:
2976 increment = !zSign && zSig2;
2977 break;
2978 case float_round_down:
2979 increment = zSign && zSig2;
2980 break;
9ee6f678
BR
2981 case float_round_to_odd:
2982 increment = !(zSig1 & 0x1) && zSig2;
2983 break;
dc355b76
PM
2984 default:
2985 abort();
158142c2
FB
2986 }
2987 }
2988 }
a2f2d288
PM
2989 if (zSig2) {
2990 status->float_exception_flags |= float_flag_inexact;
2991 }
158142c2
FB
2992 if ( increment ) {
2993 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2994 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2995 }
2996 else {
2997 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2998 }
2999 return packFloat128( zSign, zExp, zSig0, zSig1 );
3000
3001}
3002
3003/*----------------------------------------------------------------------------
3004| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3005| and significand formed by the concatenation of `zSig0' and `zSig1', and
3006| returns the proper quadruple-precision floating-point value corresponding
3007| to the abstract input. This routine is just like `roundAndPackFloat128'
3008| except that the input significand has fewer bits and does not have to be
3009| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3010| point exponent.
3011*----------------------------------------------------------------------------*/
3012
f4014512 3013static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
3014 uint64_t zSig0, uint64_t zSig1,
3015 float_status *status)
158142c2 3016{
8f506c70 3017 int8_t shiftCount;
bb98fe42 3018 uint64_t zSig2;
158142c2
FB
3019
3020 if ( zSig0 == 0 ) {
3021 zSig0 = zSig1;
3022 zSig1 = 0;
3023 zExp -= 64;
3024 }
3025 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3026 if ( 0 <= shiftCount ) {
3027 zSig2 = 0;
3028 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3029 }
3030 else {
3031 shift128ExtraRightJamming(
3032 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3033 }
3034 zExp -= shiftCount;
ff32e16e 3035 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
3036
3037}
3038
158142c2 3039
158142c2
FB
3040/*----------------------------------------------------------------------------
3041| Returns the result of converting the 32-bit two's complement integer `a'
3042| to the extended double-precision floating-point format. The conversion
3043| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3044| Arithmetic.
3045*----------------------------------------------------------------------------*/
3046
e5a41ffa 3047floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
3048{
3049 flag zSign;
3a87d009 3050 uint32_t absA;
8f506c70 3051 int8_t shiftCount;
bb98fe42 3052 uint64_t zSig;
158142c2
FB
3053
3054 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3055 zSign = ( a < 0 );
3056 absA = zSign ? - a : a;
3057 shiftCount = countLeadingZeros32( absA ) + 32;
3058 zSig = absA;
3059 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3060
3061}
3062
158142c2
FB
3063/*----------------------------------------------------------------------------
3064| Returns the result of converting the 32-bit two's complement integer `a' to
3065| the quadruple-precision floating-point format. The conversion is performed
3066| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3067*----------------------------------------------------------------------------*/
3068
e5a41ffa 3069float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
3070{
3071 flag zSign;
3a87d009 3072 uint32_t absA;
8f506c70 3073 int8_t shiftCount;
bb98fe42 3074 uint64_t zSig0;
158142c2
FB
3075
3076 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3077 zSign = ( a < 0 );
3078 absA = zSign ? - a : a;
3079 shiftCount = countLeadingZeros32( absA ) + 17;
3080 zSig0 = absA;
3081 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3082
3083}
3084
158142c2
FB
3085/*----------------------------------------------------------------------------
3086| Returns the result of converting the 64-bit two's complement integer `a'
3087| to the extended double-precision floating-point format. The conversion
3088| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3089| Arithmetic.
3090*----------------------------------------------------------------------------*/
3091
e5a41ffa 3092floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
3093{
3094 flag zSign;
182f42fd 3095 uint64_t absA;
8f506c70 3096 int8_t shiftCount;
158142c2
FB
3097
3098 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3099 zSign = ( a < 0 );
3100 absA = zSign ? - a : a;
3101 shiftCount = countLeadingZeros64( absA );
3102 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3103
3104}
3105
158142c2
FB
3106/*----------------------------------------------------------------------------
3107| Returns the result of converting the 64-bit two's complement integer `a' to
3108| the quadruple-precision floating-point format. The conversion is performed
3109| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3110*----------------------------------------------------------------------------*/
3111
e5a41ffa 3112float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
3113{
3114 flag zSign;
182f42fd 3115 uint64_t absA;
8f506c70 3116 int8_t shiftCount;
f4014512 3117 int32_t zExp;
bb98fe42 3118 uint64_t zSig0, zSig1;
158142c2
FB
3119
3120 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3121 zSign = ( a < 0 );
3122 absA = zSign ? - a : a;
3123 shiftCount = countLeadingZeros64( absA ) + 49;
3124 zExp = 0x406E - shiftCount;
3125 if ( 64 <= shiftCount ) {
3126 zSig1 = 0;
3127 zSig0 = absA;
3128 shiftCount -= 64;
3129 }
3130 else {
3131 zSig1 = absA;
3132 zSig0 = 0;
3133 }
3134 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3135 return packFloat128( zSign, zExp, zSig0, zSig1 );
3136
3137}
3138
6bb8e0f1
PM
3139/*----------------------------------------------------------------------------
3140| Returns the result of converting the 64-bit unsigned integer `a'
3141| to the quadruple-precision floating-point format. The conversion is performed
3142| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3143*----------------------------------------------------------------------------*/
3144
e5a41ffa 3145float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
3146{
3147 if (a == 0) {
3148 return float128_zero;
3149 }
ff32e16e 3150 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1e397ead
RH
3151}
3152
158142c2 3153
158142c2 3154
158142c2
FB
3155
3156/*----------------------------------------------------------------------------
3157| Returns the result of converting the single-precision floating-point value
3158| `a' to the double-precision floating-point format. The conversion is
3159| performed according to the IEC/IEEE Standard for Binary Floating-Point
3160| Arithmetic.
3161*----------------------------------------------------------------------------*/
3162
e5a41ffa 3163float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
3164{
3165 flag aSign;
0c48262d 3166 int aExp;
bb98fe42 3167 uint32_t aSig;
ff32e16e 3168 a = float32_squash_input_denormal(a, status);
158142c2
FB
3169
3170 aSig = extractFloat32Frac( a );
3171 aExp = extractFloat32Exp( a );
3172 aSign = extractFloat32Sign( a );
3173 if ( aExp == 0xFF ) {
ff32e16e
PM
3174 if (aSig) {
3175 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3176 }
158142c2
FB
3177 return packFloat64( aSign, 0x7FF, 0 );
3178 }
3179 if ( aExp == 0 ) {
3180 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3181 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3182 --aExp;
3183 }
bb98fe42 3184 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
3185
3186}
3187
158142c2
FB
3188/*----------------------------------------------------------------------------
3189| Returns the result of converting the single-precision floating-point value
3190| `a' to the extended double-precision floating-point format. The conversion
3191| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3192| Arithmetic.
3193*----------------------------------------------------------------------------*/
3194
e5a41ffa 3195floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
3196{
3197 flag aSign;
0c48262d 3198 int aExp;
bb98fe42 3199 uint32_t aSig;
158142c2 3200
ff32e16e 3201 a = float32_squash_input_denormal(a, status);
158142c2
FB
3202 aSig = extractFloat32Frac( a );
3203 aExp = extractFloat32Exp( a );
3204 aSign = extractFloat32Sign( a );
3205 if ( aExp == 0xFF ) {
ff32e16e
PM
3206 if (aSig) {
3207 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3208 }
0f605c88
LV
3209 return packFloatx80(aSign,
3210 floatx80_infinity_high,
3211 floatx80_infinity_low);
158142c2
FB
3212 }
3213 if ( aExp == 0 ) {
3214 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3215 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3216 }
3217 aSig |= 0x00800000;
bb98fe42 3218 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
3219
3220}
3221
158142c2
FB
3222/*----------------------------------------------------------------------------
3223| Returns the result of converting the single-precision floating-point value
3224| `a' to the double-precision floating-point format. The conversion is
3225| performed according to the IEC/IEEE Standard for Binary Floating-Point
3226| Arithmetic.
3227*----------------------------------------------------------------------------*/
3228
e5a41ffa 3229float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
3230{
3231 flag aSign;
0c48262d 3232 int aExp;
bb98fe42 3233 uint32_t aSig;
158142c2 3234
ff32e16e 3235 a = float32_squash_input_denormal(a, status);
158142c2
FB
3236 aSig = extractFloat32Frac( a );
3237 aExp = extractFloat32Exp( a );
3238 aSign = extractFloat32Sign( a );
3239 if ( aExp == 0xFF ) {
ff32e16e
PM
3240 if (aSig) {
3241 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3242 }
158142c2
FB
3243 return packFloat128( aSign, 0x7FFF, 0, 0 );
3244 }
3245 if ( aExp == 0 ) {
3246 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3247 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3248 --aExp;
3249 }
bb98fe42 3250 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
3251
3252}
3253
158142c2
FB
3254/*----------------------------------------------------------------------------
3255| Returns the remainder of the single-precision floating-point value `a'
3256| with respect to the corresponding value `b'. The operation is performed
3257| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3258*----------------------------------------------------------------------------*/
3259
e5a41ffa 3260float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 3261{
ed086f3d 3262 flag aSign, zSign;
0c48262d 3263 int aExp, bExp, expDiff;
bb98fe42
AF
3264 uint32_t aSig, bSig;
3265 uint32_t q;
3266 uint64_t aSig64, bSig64, q64;
3267 uint32_t alternateASig;
3268 int32_t sigMean;
ff32e16e
PM
3269 a = float32_squash_input_denormal(a, status);
3270 b = float32_squash_input_denormal(b, status);
158142c2
FB
3271
3272 aSig = extractFloat32Frac( a );
3273 aExp = extractFloat32Exp( a );
3274 aSign = extractFloat32Sign( a );
3275 bSig = extractFloat32Frac( b );
3276 bExp = extractFloat32Exp( b );
158142c2
FB
3277 if ( aExp == 0xFF ) {
3278 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 3279 return propagateFloat32NaN(a, b, status);
158142c2 3280 }
ff32e16e 3281 float_raise(float_flag_invalid, status);
af39bc8c 3282 return float32_default_nan(status);
158142c2
FB
3283 }
3284 if ( bExp == 0xFF ) {
ff32e16e
PM
3285 if (bSig) {
3286 return propagateFloat32NaN(a, b, status);
3287 }
158142c2
FB
3288 return a;
3289 }
3290 if ( bExp == 0 ) {
3291 if ( bSig == 0 ) {
ff32e16e 3292 float_raise(float_flag_invalid, status);
af39bc8c 3293 return float32_default_nan(status);
158142c2
FB
3294 }
3295 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3296 }
3297 if ( aExp == 0 ) {
3298 if ( aSig == 0 ) return a;
3299 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3300 }
3301 expDiff = aExp - bExp;
3302 aSig |= 0x00800000;
3303 bSig |= 0x00800000;
3304 if ( expDiff < 32 ) {
3305 aSig <<= 8;
3306 bSig <<= 8;
3307 if ( expDiff < 0 ) {
3308 if ( expDiff < -1 ) return a;
3309 aSig >>= 1;
3310 }
3311 q = ( bSig <= aSig );
3312 if ( q ) aSig -= bSig;
3313 if ( 0 < expDiff ) {
bb98fe42 3314 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
3315 q >>= 32 - expDiff;
3316 bSig >>= 2;
3317 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3318 }
3319 else {
3320 aSig >>= 2;
3321 bSig >>= 2;
3322 }
3323 }
3324 else {
3325 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
3326 aSig64 = ( (uint64_t) aSig )<<40;
3327 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
3328 expDiff -= 64;
3329 while ( 0 < expDiff ) {
3330 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3331 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3332 aSig64 = - ( ( bSig * q64 )<<38 );
3333 expDiff -= 62;
3334 }
3335 expDiff += 64;
3336 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3337 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3338 q = q64>>( 64 - expDiff );
3339 bSig <<= 6;
3340 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3341 }
3342 do {
3343 alternateASig = aSig;
3344 ++q;
3345 aSig -= bSig;
bb98fe42 3346 } while ( 0 <= (int32_t) aSig );
158142c2
FB
3347 sigMean = aSig + alternateASig;
3348 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3349 aSig = alternateASig;
3350 }
bb98fe42 3351 zSign = ( (int32_t) aSig < 0 );
158142c2 3352 if ( zSign ) aSig = - aSig;
ff32e16e 3353 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
3354}
3355
369be8f6 3356
158142c2 3357
8229c991
AJ
3358/*----------------------------------------------------------------------------
3359| Returns the binary exponential of the single-precision floating-point value
3360| `a'. The operation is performed according to the IEC/IEEE Standard for
3361| Binary Floating-Point Arithmetic.
3362|
3363| Uses the following identities:
3364|
3365| 1. -------------------------------------------------------------------------
3366| x x*ln(2)
3367| 2 = e
3368|
3369| 2. -------------------------------------------------------------------------
3370| 2 3 4 5 n
3371| x x x x x x x
3372| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3373| 1! 2! 3! 4! 5! n!
3374*----------------------------------------------------------------------------*/
3375
3376static const float64 float32_exp2_coefficients[15] =
3377{
d5138cf4
PM
3378 const_float64( 0x3ff0000000000000ll ), /* 1 */
3379 const_float64( 0x3fe0000000000000ll ), /* 2 */
3380 const_float64( 0x3fc5555555555555ll ), /* 3 */
3381 const_float64( 0x3fa5555555555555ll ), /* 4 */
3382 const_float64( 0x3f81111111111111ll ), /* 5 */
3383 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3384 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3385 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3386 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3387 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3388 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3389 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3390 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3391 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3392 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
3393};
3394
e5a41ffa 3395float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
3396{
3397 flag aSign;
0c48262d 3398 int aExp;
bb98fe42 3399 uint32_t aSig;
8229c991
AJ
3400 float64 r, x, xn;
3401 int i;
ff32e16e 3402 a = float32_squash_input_denormal(a, status);
8229c991
AJ
3403
3404 aSig = extractFloat32Frac( a );
3405 aExp = extractFloat32Exp( a );
3406 aSign = extractFloat32Sign( a );
3407
3408 if ( aExp == 0xFF) {
ff32e16e
PM
3409 if (aSig) {
3410 return propagateFloat32NaN(a, float32_zero, status);
3411 }
8229c991
AJ
3412 return (aSign) ? float32_zero : a;
3413 }
3414 if (aExp == 0) {
3415 if (aSig == 0) return float32_one;
3416 }
3417
ff32e16e 3418 float_raise(float_flag_inexact, status);
8229c991
AJ
3419
3420 /* ******************************* */
3421 /* using float64 for approximation */
3422 /* ******************************* */
ff32e16e
PM
3423 x = float32_to_float64(a, status);
3424 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
3425
3426 xn = x;
3427 r = float64_one;
3428 for (i = 0 ; i < 15 ; i++) {
3429 float64 f;
3430
ff32e16e
PM
3431 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3432 r = float64_add(r, f, status);
8229c991 3433
ff32e16e 3434 xn = float64_mul(xn, x, status);
8229c991
AJ
3435 }
3436
3437 return float64_to_float32(r, status);
3438}
3439
374dfc33
AJ
3440/*----------------------------------------------------------------------------
3441| Returns the binary log of the single-precision floating-point value `a'.
3442| The operation is performed according to the IEC/IEEE Standard for Binary
3443| Floating-Point Arithmetic.
3444*----------------------------------------------------------------------------*/
e5a41ffa 3445float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
3446{
3447 flag aSign, zSign;
0c48262d 3448 int aExp;
bb98fe42 3449 uint32_t aSig, zSig, i;
374dfc33 3450
ff32e16e 3451 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
3452 aSig = extractFloat32Frac( a );
3453 aExp = extractFloat32Exp( a );
3454 aSign = extractFloat32Sign( a );
3455
3456 if ( aExp == 0 ) {
3457 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3458 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3459 }
3460 if ( aSign ) {
ff32e16e 3461 float_raise(float_flag_invalid, status);
af39bc8c 3462 return float32_default_nan(status);
374dfc33
AJ
3463 }
3464 if ( aExp == 0xFF ) {
ff32e16e
PM
3465 if (aSig) {
3466 return propagateFloat32NaN(a, float32_zero, status);
3467 }
374dfc33
AJ
3468 return a;
3469 }
3470
3471 aExp -= 0x7F;
3472 aSig |= 0x00800000;
3473 zSign = aExp < 0;
3474 zSig = aExp << 23;
3475
3476 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 3477 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
3478 if ( aSig & 0x01000000 ) {
3479 aSig >>= 1;
3480 zSig |= i;
3481 }
3482 }
3483
3484 if ( zSign )
3485 zSig = -zSig;
3486
ff32e16e 3487 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
3488}
3489
158142c2
FB
3490/*----------------------------------------------------------------------------
3491| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
3492| the corresponding value `b', and 0 otherwise. The invalid exception is
3493| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
3494| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3495*----------------------------------------------------------------------------*/
3496
e5a41ffa 3497int float32_eq(float32 a, float32 b, float_status *status)
158142c2 3498{
b689362d 3499 uint32_t av, bv;
ff32e16e
PM
3500 a = float32_squash_input_denormal(a, status);
3501 b = float32_squash_input_denormal(b, status);
158142c2
FB
3502
3503 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3504 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3505 ) {
ff32e16e 3506 float_raise(float_flag_invalid, status);
158142c2
FB
3507 return 0;
3508 }
b689362d
AJ
3509 av = float32_val(a);
3510 bv = float32_val(b);
3511 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3512}
3513
3514/*----------------------------------------------------------------------------
3515| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3516| or equal to the corresponding value `b', and 0 otherwise. The invalid
3517| exception is raised if either operand is a NaN. The comparison is performed
3518| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3519*----------------------------------------------------------------------------*/
3520
e5a41ffa 3521int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
3522{
3523 flag aSign, bSign;
bb98fe42 3524 uint32_t av, bv;
ff32e16e
PM
3525 a = float32_squash_input_denormal(a, status);
3526 b = float32_squash_input_denormal(b, status);
158142c2
FB
3527
3528 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3529 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3530 ) {
ff32e16e 3531 float_raise(float_flag_invalid, status);
158142c2
FB
3532 return 0;
3533 }
3534 aSign = extractFloat32Sign( a );
3535 bSign = extractFloat32Sign( b );
f090c9d4
PB
3536 av = float32_val(a);
3537 bv = float32_val(b);
bb98fe42 3538 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3539 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3540
3541}
3542
3543/*----------------------------------------------------------------------------
3544| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3545| the corresponding value `b', and 0 otherwise. The invalid exception is
3546| raised if either operand is a NaN. The comparison is performed according
3547| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3548*----------------------------------------------------------------------------*/
3549
e5a41ffa 3550int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
3551{
3552 flag aSign, bSign;
bb98fe42 3553 uint32_t av, bv;
ff32e16e
PM
3554 a = float32_squash_input_denormal(a, status);
3555 b = float32_squash_input_denormal(b, status);
158142c2
FB
3556
3557 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3558 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3559 ) {
ff32e16e 3560 float_raise(float_flag_invalid, status);
158142c2
FB
3561 return 0;
3562 }
3563 aSign = extractFloat32Sign( a );
3564 bSign = extractFloat32Sign( b );
f090c9d4
PB
3565 av = float32_val(a);
3566 bv = float32_val(b);
bb98fe42 3567 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3568 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3569
3570}
3571
67b7861d
AJ
3572/*----------------------------------------------------------------------------
3573| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
3574| be compared, and 0 otherwise. The invalid exception is raised if either
3575| operand is a NaN. The comparison is performed according to the IEC/IEEE
3576| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
3577*----------------------------------------------------------------------------*/
3578
e5a41ffa 3579int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 3580{
ff32e16e
PM
3581 a = float32_squash_input_denormal(a, status);
3582 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3583
3584 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3585 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3586 ) {
ff32e16e 3587 float_raise(float_flag_invalid, status);
67b7861d
AJ
3588 return 1;
3589 }
3590 return 0;
3591}
b689362d 3592
158142c2
FB
3593/*----------------------------------------------------------------------------
3594| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
3595| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3596| exception. The comparison is performed according to the IEC/IEEE Standard
3597| for Binary Floating-Point Arithmetic.
158142c2
FB
3598*----------------------------------------------------------------------------*/
3599
e5a41ffa 3600int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 3601{
ff32e16e
PM
3602 a = float32_squash_input_denormal(a, status);
3603 b = float32_squash_input_denormal(b, status);
158142c2
FB
3604
3605 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3606 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3607 ) {
af39bc8c
AM
3608 if (float32_is_signaling_nan(a, status)
3609 || float32_is_signaling_nan(b, status)) {
ff32e16e 3610 float_raise(float_flag_invalid, status);
b689362d 3611 }
158142c2
FB
3612 return 0;
3613 }
b689362d
AJ
3614 return ( float32_val(a) == float32_val(b) ) ||
3615 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
3616}
3617
3618/*----------------------------------------------------------------------------
3619| Returns 1 if the single-precision floating-point value `a' is less than or
3620| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3621| cause an exception. Otherwise, the comparison is performed according to the
3622| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3623*----------------------------------------------------------------------------*/
3624
e5a41ffa 3625int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3626{
3627 flag aSign, bSign;
bb98fe42 3628 uint32_t av, bv;
ff32e16e
PM
3629 a = float32_squash_input_denormal(a, status);
3630 b = float32_squash_input_denormal(b, status);
158142c2
FB
3631
3632 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3633 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3634 ) {
af39bc8c
AM
3635 if (float32_is_signaling_nan(a, status)
3636 || float32_is_signaling_nan(b, status)) {
ff32e16e 3637 float_raise(float_flag_invalid, status);
158142c2
FB
3638 }
3639 return 0;
3640 }
3641 aSign = extractFloat32Sign( a );
3642 bSign = extractFloat32Sign( b );
f090c9d4
PB
3643 av = float32_val(a);
3644 bv = float32_val(b);
bb98fe42 3645 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3646 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3647
3648}
3649
3650/*----------------------------------------------------------------------------
3651| Returns 1 if the single-precision floating-point value `a' is less than
3652| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3653| exception. Otherwise, the comparison is performed according to the IEC/IEEE
ab52f973 3654| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3655*----------------------------------------------------------------------------*/
3656
ab52f973 3657int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2 3658{
ab52f973
AB
3659 flag aSign, bSign;
3660 uint32_t av, bv;
3661 a = float32_squash_input_denormal(a, status);
3662 b = float32_squash_input_denormal(b, status);
158142c2 3663
ab52f973
AB
3664 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3665 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3666 ) {
3667 if (float32_is_signaling_nan(a, status)
3668 || float32_is_signaling_nan(b, status)) {
ff32e16e 3669 float_raise(float_flag_invalid, status);
158142c2 3670 }
ab52f973 3671 return 0;
158142c2 3672 }
ab52f973
AB
3673 aSign = extractFloat32Sign( a );
3674 bSign = extractFloat32Sign( b );
3675 av = float32_val(a);
3676 bv = float32_val(b);
3677 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3678 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3679
3680}
3681
3682/*----------------------------------------------------------------------------
ab52f973
AB
3683| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3684| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3685| comparison is performed according to the IEC/IEEE Standard for Binary
3686| Floating-Point Arithmetic.
158142c2
FB
3687*----------------------------------------------------------------------------*/
3688
ab52f973 3689int float32_unordered_quiet(float32 a, float32 b, float_status *status)
158142c2 3690{
ab52f973
AB
3691 a = float32_squash_input_denormal(a, status);
3692 b = float32_squash_input_denormal(b, status);
158142c2 3693
ab52f973
AB
3694 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3695 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3696 ) {
3697 if (float32_is_signaling_nan(a, status)
3698 || float32_is_signaling_nan(b, status)) {
3699 float_raise(float_flag_invalid, status);
158142c2 3700 }
ab52f973 3701 return 1;
158142c2 3702 }
ab52f973 3703 return 0;
158142c2
FB
3704}
3705
ab52f973 3706
158142c2
FB
3707/*----------------------------------------------------------------------------
3708| Returns the result of converting the double-precision floating-point value
3709| `a' to the single-precision floating-point format. The conversion is
3710| performed according to the IEC/IEEE Standard for Binary Floating-Point
3711| Arithmetic.
3712*----------------------------------------------------------------------------*/
3713
e5a41ffa 3714float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3715{
3716 flag aSign;
0c48262d 3717 int aExp;
bb98fe42
AF
3718 uint64_t aSig;
3719 uint32_t zSig;
ff32e16e 3720 a = float64_squash_input_denormal(a, status);
158142c2
FB
3721
3722 aSig = extractFloat64Frac( a );
3723 aExp = extractFloat64Exp( a );
3724 aSign = extractFloat64Sign( a );
3725 if ( aExp == 0x7FF ) {
ff32e16e
PM
3726 if (aSig) {
3727 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3728 }
158142c2
FB
3729 return packFloat32( aSign, 0xFF, 0 );
3730 }
3731 shift64RightJamming( aSig, 22, &aSig );
3732 zSig = aSig;
3733 if ( aExp || zSig ) {
3734 zSig |= 0x40000000;
3735 aExp -= 0x381;
3736 }
ff32e16e 3737 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3738
3739}
3740
60011498
PB
3741
3742/*----------------------------------------------------------------------------
3743| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3744| half-precision floating-point value, returning the result. After being
3745| shifted into the proper positions, the three fields are simply added
3746| together to form the result. This means that any integer portion of `zSig'
3747| will be added into the exponent. Since a properly normalized significand
3748| will have an integer portion equal to 1, the `zExp' input should be 1 less
3749| than the desired result exponent whenever `zSig' is a complete, normalized
3750| significand.
3751*----------------------------------------------------------------------------*/
0c48262d 3752static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
60011498 3753{
bb4d4bb3 3754 return make_float16(
bb98fe42 3755 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3756}
3757
c4a1c5e7
PM
3758/*----------------------------------------------------------------------------
3759| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3760| and significand `zSig', and returns the proper half-precision floating-
3761| point value corresponding to the abstract input. Ordinarily, the abstract
3762| value is simply rounded and packed into the half-precision format, with
3763| the inexact exception raised if the abstract input cannot be represented
3764| exactly. However, if the abstract value is too large, the overflow and
3765| inexact exceptions are raised and an infinity or maximal finite value is
3766| returned. If the abstract value is too small, the input value is rounded to
3767| a subnormal number, and the underflow and inexact exceptions are raised if
3768| the abstract input cannot be represented exactly as a subnormal half-
3769| precision floating-point number.
3770| The `ieee' flag indicates whether to use IEEE standard half precision, or
3771| ARM-style "alternative representation", which omits the NaN and Inf
3772| encodings in order to raise the maximum representable exponent by one.
3773| The input significand `zSig' has its binary point between bits 22
3774| and 23, which is 13 bits to the left of the usual location. This shifted
3775| significand must be normalized or smaller. If `zSig' is not normalized,
3776| `zExp' must be 0; in that case, the result returned is a subnormal number,
3777| and it must not require rounding. In the usual case that `zSig' is
3778| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3779| Note the slightly odd position of the binary point in zSig compared with the
3780| other roundAndPackFloat functions. This should probably be fixed if we
3781| need to implement more float16 routines than just conversion.
3782| The handling of underflow and overflow follows the IEC/IEEE Standard for
3783| Binary Floating-Point Arithmetic.
3784*----------------------------------------------------------------------------*/
3785
0c48262d 3786static float16 roundAndPackFloat16(flag zSign, int zExp,
e5a41ffa
PM
3787 uint32_t zSig, flag ieee,
3788 float_status *status)
c4a1c5e7
PM
3789{
3790 int maxexp = ieee ? 29 : 30;
3791 uint32_t mask;
3792 uint32_t increment;
c4a1c5e7
PM
3793 bool rounding_bumps_exp;
3794 bool is_tiny = false;
3795
3796 /* Calculate the mask of bits of the mantissa which are not
3797 * representable in half-precision and will be lost.
3798 */
3799 if (zExp < 1) {
3800 /* Will be denormal in halfprec */
3801 mask = 0x00ffffff;
3802 if (zExp >= -11) {
3803 mask >>= 11 + zExp;
3804 }
3805 } else {
3806 /* Normal number in halfprec */
3807 mask = 0x00001fff;
3808 }
3809
a2f2d288 3810 switch (status->float_rounding_mode) {
c4a1c5e7
PM
3811 case float_round_nearest_even:
3812 increment = (mask + 1) >> 1;
3813 if ((zSig & mask) == increment) {
3814 increment = zSig & (increment << 1);
3815 }
3816 break;
f9288a76
PM
3817 case float_round_ties_away:
3818 increment = (mask + 1) >> 1;
3819 break;
c4a1c5e7
PM
3820 case float_round_up:
3821 increment = zSign ? 0 : mask;
3822 break;
3823 case float_round_down:
3824 increment = zSign ? mask : 0;
3825 break;
3826 default: /* round_to_zero */
3827 increment = 0;
3828 break;
3829 }
3830
3831 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3832
3833 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3834 if (ieee) {
ff32e16e 3835 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3836 return packFloat16(zSign, 0x1f, 0);
3837 } else {
ff32e16e 3838 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3839 return packFloat16(zSign, 0x1f, 0x3ff);
3840 }
3841 }
3842
3843 if (zExp < 0) {
3844 /* Note that flush-to-zero does not affect half-precision results */
3845 is_tiny =
a2f2d288 3846 (status->float_detect_tininess == float_tininess_before_rounding)
c4a1c5e7
PM
3847 || (zExp < -1)
3848 || (!rounding_bumps_exp);
3849 }
3850 if (zSig & mask) {
ff32e16e 3851 float_raise(float_flag_inexact, status);
c4a1c5e7 3852 if (is_tiny) {
ff32e16e 3853 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3854 }
3855 }
3856
3857 zSig += increment;
3858 if (rounding_bumps_exp) {
3859 zSig >>= 1;
3860 zExp++;
3861 }
3862
3863 if (zExp < -10) {
3864 return packFloat16(zSign, 0, 0);
3865 }
3866 if (zExp < 0) {
3867 zSig >>= -zExp;
3868 zExp = 0;
3869 }
3870 return packFloat16(zSign, zExp, zSig >> 13);
3871}
3872
210cbd49
AB
3873/*----------------------------------------------------------------------------
3874| If `a' is denormal and we are in flush-to-zero mode then set the
3875| input-denormal exception and return zero. Otherwise just return the value.
3876*----------------------------------------------------------------------------*/
3877float16 float16_squash_input_denormal(float16 a, float_status *status)
3878{
3879 if (status->flush_inputs_to_zero) {
3880 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3881 float_raise(float_flag_input_denormal, status);
3882 return make_float16(float16_val(a) & 0x8000);
3883 }
3884 }
3885 return a;
3886}
3887
/*----------------------------------------------------------------------------
| Normalizes the subnormal half-precision significand `aSig'.  The shift
| count is the number of leading zeros of `aSig' (as a 32-bit value) minus 21.
| The shifted significand is stored at `zSigPtr' and the matching exponent,
| 1 minus the shift count, at `zExpPtr'.
*----------------------------------------------------------------------------*/
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    const int8_t shift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
3895
60011498
PB
3896/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3897 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3898
e5a41ffa 3899float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3900{
3901 flag aSign;
0c48262d 3902 int aExp;
bb98fe42 3903 uint32_t aSig;
60011498 3904
bb4d4bb3
PM
3905 aSign = extractFloat16Sign(a);
3906 aExp = extractFloat16Exp(a);
3907 aSig = extractFloat16Frac(a);
60011498
PB
3908
3909 if (aExp == 0x1f && ieee) {
3910 if (aSig) {
ff32e16e 3911 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3912 }
4be8eeac 3913 return packFloat32(aSign, 0xff, 0);
60011498
PB
3914 }
3915 if (aExp == 0) {
60011498
PB
3916 if (aSig == 0) {
3917 return packFloat32(aSign, 0, 0);
3918 }
3919
c4a1c5e7
PM
3920 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3921 aExp--;
60011498
PB
3922 }
3923 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3924}
3925
e5a41ffa 3926float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
3927{
3928 flag aSign;
0c48262d 3929 int aExp;
bb98fe42 3930 uint32_t aSig;
38970efa 3931
ff32e16e 3932 a = float32_squash_input_denormal(a, status);
60011498
PB
3933
3934 aSig = extractFloat32Frac( a );
3935 aExp = extractFloat32Exp( a );
3936 aSign = extractFloat32Sign( a );
3937 if ( aExp == 0xFF ) {
3938 if (aSig) {
600e30d2 3939 /* Input is a NaN */
600e30d2 3940 if (!ieee) {
ff32e16e 3941 float_raise(float_flag_invalid, status);
600e30d2
PM
3942 return packFloat16(aSign, 0, 0);
3943 }
38970efa 3944 return commonNaNToFloat16(
ff32e16e 3945 float32ToCommonNaN(a, status), status);
60011498 3946 }
600e30d2
PM
3947 /* Infinity */
3948 if (!ieee) {
ff32e16e 3949 float_raise(float_flag_invalid, status);
600e30d2
PM
3950 return packFloat16(aSign, 0x1f, 0x3ff);
3951 }
3952 return packFloat16(aSign, 0x1f, 0);
60011498 3953 }
600e30d2 3954 if (aExp == 0 && aSig == 0) {
60011498
PB
3955 return packFloat16(aSign, 0, 0);
3956 }
38970efa
PM
3957 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3958 * even if the input is denormal; however this is harmless because
3959 * the largest possible single-precision denormal is still smaller
3960 * than the smallest representable half-precision denormal, and so we
3961 * will end up ignoring aSig and returning via the "always return zero"
3962 * codepath.
3963 */
60011498 3964 aSig |= 0x00800000;
c4a1c5e7 3965 aExp -= 0x71;
60011498 3966
ff32e16e 3967 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
3968}
3969
e5a41ffa 3970float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
3971{
3972 flag aSign;
0c48262d 3973 int aExp;
14c9a07e
PM
3974 uint32_t aSig;
3975
3976 aSign = extractFloat16Sign(a);
3977 aExp = extractFloat16Exp(a);
3978 aSig = extractFloat16Frac(a);
3979
3980 if (aExp == 0x1f && ieee) {
3981 if (aSig) {
3982 return commonNaNToFloat64(
ff32e16e 3983 float16ToCommonNaN(a, status), status);
14c9a07e
PM
3984 }
3985 return packFloat64(aSign, 0x7ff, 0);
3986 }
3987 if (aExp == 0) {
3988 if (aSig == 0) {
3989 return packFloat64(aSign, 0, 0);
3990 }
3991
3992 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3993 aExp--;
3994 }
3995 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3996}
3997
e5a41ffa 3998float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
3999{
4000 flag aSign;
0c48262d 4001 int aExp;
14c9a07e
PM
4002 uint64_t aSig;
4003 uint32_t zSig;
4004
ff32e16e 4005 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
4006
4007 aSig = extractFloat64Frac(a);
4008 aExp = extractFloat64Exp(a);
4009 aSign = extractFloat64Sign(a);
4010 if (aExp == 0x7FF) {
4011 if (aSig) {
4012 /* Input is a NaN */
4013 if (!ieee) {
ff32e16e 4014 float_raise(float_flag_invalid, status);
14c9a07e
PM
4015 return packFloat16(aSign, 0, 0);
4016 }
4017 return commonNaNToFloat16(
ff32e16e 4018 float64ToCommonNaN(a, status), status);
14c9a07e
PM
4019 }
4020 /* Infinity */
4021 if (!ieee) {
ff32e16e 4022 float_raise(float_flag_invalid, status);
14c9a07e
PM
4023 return packFloat16(aSign, 0x1f, 0x3ff);
4024 }
4025 return packFloat16(aSign, 0x1f, 0);
4026 }
4027 shift64RightJamming(aSig, 29, &aSig);
4028 zSig = aSig;
4029 if (aExp == 0 && zSig == 0) {
4030 return packFloat16(aSign, 0, 0);
4031 }
4032 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4033 * even if the input is denormal; however this is harmless because
4034 * the largest possible single-precision denormal is still smaller
4035 * than the smallest representable half-precision denormal, and so we
4036 * will end up ignoring aSig and returning via the "always return zero"
4037 * codepath.
4038 */
4039 zSig |= 0x00800000;
4040 aExp -= 0x3F1;
4041
ff32e16e 4042 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
4043}
4044
158142c2
FB
4045/*----------------------------------------------------------------------------
4046| Returns the result of converting the double-precision floating-point value
4047| `a' to the extended double-precision floating-point format. The conversion
4048| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4049| Arithmetic.
4050*----------------------------------------------------------------------------*/
4051
e5a41ffa 4052floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
4053{
4054 flag aSign;
0c48262d 4055 int aExp;
bb98fe42 4056 uint64_t aSig;
158142c2 4057
ff32e16e 4058 a = float64_squash_input_denormal(a, status);
158142c2
FB
4059 aSig = extractFloat64Frac( a );
4060 aExp = extractFloat64Exp( a );
4061 aSign = extractFloat64Sign( a );
4062 if ( aExp == 0x7FF ) {
ff32e16e
PM
4063 if (aSig) {
4064 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4065 }
0f605c88
LV
4066 return packFloatx80(aSign,
4067 floatx80_infinity_high,
4068 floatx80_infinity_low);
158142c2
FB
4069 }
4070 if ( aExp == 0 ) {
4071 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4072 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4073 }
4074 return
4075 packFloatx80(
4076 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4077
4078}
4079
158142c2
FB
4080/*----------------------------------------------------------------------------
4081| Returns the result of converting the double-precision floating-point value
4082| `a' to the quadruple-precision floating-point format. The conversion is
4083| performed according to the IEC/IEEE Standard for Binary Floating-Point
4084| Arithmetic.
4085*----------------------------------------------------------------------------*/
4086
e5a41ffa 4087float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
4088{
4089 flag aSign;
0c48262d 4090 int aExp;
bb98fe42 4091 uint64_t aSig, zSig0, zSig1;
158142c2 4092
ff32e16e 4093 a = float64_squash_input_denormal(a, status);
158142c2
FB
4094 aSig = extractFloat64Frac( a );
4095 aExp = extractFloat64Exp( a );
4096 aSign = extractFloat64Sign( a );
4097 if ( aExp == 0x7FF ) {
ff32e16e
PM
4098 if (aSig) {
4099 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4100 }
158142c2
FB
4101 return packFloat128( aSign, 0x7FFF, 0, 0 );
4102 }
4103 if ( aExp == 0 ) {
4104 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4105 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4106 --aExp;
4107 }
4108 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4109 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4110
4111}
4112
158142c2
FB
4113
4114/*----------------------------------------------------------------------------
4115| Returns the remainder of the double-precision floating-point value `a'
4116| with respect to the corresponding value `b'. The operation is performed
4117| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4118*----------------------------------------------------------------------------*/
4119
e5a41ffa 4120float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 4121{
ed086f3d 4122 flag aSign, zSign;
0c48262d 4123 int aExp, bExp, expDiff;
bb98fe42
AF
4124 uint64_t aSig, bSig;
4125 uint64_t q, alternateASig;
4126 int64_t sigMean;
158142c2 4127
ff32e16e
PM
4128 a = float64_squash_input_denormal(a, status);
4129 b = float64_squash_input_denormal(b, status);
158142c2
FB
4130 aSig = extractFloat64Frac( a );
4131 aExp = extractFloat64Exp( a );
4132 aSign = extractFloat64Sign( a );
4133 bSig = extractFloat64Frac( b );
4134 bExp = extractFloat64Exp( b );
158142c2
FB
4135 if ( aExp == 0x7FF ) {
4136 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4137 return propagateFloat64NaN(a, b, status);
158142c2 4138 }
ff32e16e 4139 float_raise(float_flag_invalid, status);
af39bc8c 4140 return float64_default_nan(status);
158142c2
FB
4141 }
4142 if ( bExp == 0x7FF ) {
ff32e16e
PM
4143 if (bSig) {
4144 return propagateFloat64NaN(a, b, status);
4145 }
158142c2
FB
4146 return a;
4147 }
4148 if ( bExp == 0 ) {
4149 if ( bSig == 0 ) {
ff32e16e 4150 float_raise(float_flag_invalid, status);
af39bc8c 4151 return float64_default_nan(status);
158142c2
FB
4152 }
4153 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4154 }
4155 if ( aExp == 0 ) {
4156 if ( aSig == 0 ) return a;
4157 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4158 }
4159 expDiff = aExp - bExp;
4160 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4161 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4162 if ( expDiff < 0 ) {
4163 if ( expDiff < -1 ) return a;
4164 aSig >>= 1;
4165 }
4166 q = ( bSig <= aSig );
4167 if ( q ) aSig -= bSig;
4168 expDiff -= 64;
4169 while ( 0 < expDiff ) {
4170 q = estimateDiv128To64( aSig, 0, bSig );
4171 q = ( 2 < q ) ? q - 2 : 0;
4172 aSig = - ( ( bSig>>2 ) * q );
4173 expDiff -= 62;
4174 }
4175 expDiff += 64;
4176 if ( 0 < expDiff ) {
4177 q = estimateDiv128To64( aSig, 0, bSig );
4178 q = ( 2 < q ) ? q - 2 : 0;
4179 q >>= 64 - expDiff;
4180 bSig >>= 2;
4181 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4182 }
4183 else {
4184 aSig >>= 2;
4185 bSig >>= 2;
4186 }
4187 do {
4188 alternateASig = aSig;
4189 ++q;
4190 aSig -= bSig;
bb98fe42 4191 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4192 sigMean = aSig + alternateASig;
4193 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4194 aSig = alternateASig;
4195 }
bb98fe42 4196 zSign = ( (int64_t) aSig < 0 );
158142c2 4197 if ( zSign ) aSig = - aSig;
ff32e16e 4198 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
4199
4200}
4201
374dfc33
AJ
4202/*----------------------------------------------------------------------------
4203| Returns the binary log of the double-precision floating-point value `a'.
4204| The operation is performed according to the IEC/IEEE Standard for Binary
4205| Floating-Point Arithmetic.
4206*----------------------------------------------------------------------------*/
e5a41ffa 4207float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4208{
4209 flag aSign, zSign;
0c48262d 4210 int aExp;
bb98fe42 4211 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4212 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4213
4214 aSig = extractFloat64Frac( a );
4215 aExp = extractFloat64Exp( a );
4216 aSign = extractFloat64Sign( a );
4217
4218 if ( aExp == 0 ) {
4219 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4220 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4221 }
4222 if ( aSign ) {
ff32e16e 4223 float_raise(float_flag_invalid, status);
af39bc8c 4224 return float64_default_nan(status);
374dfc33
AJ
4225 }
4226 if ( aExp == 0x7FF ) {
ff32e16e
PM
4227 if (aSig) {
4228 return propagateFloat64NaN(a, float64_zero, status);
4229 }
374dfc33
AJ
4230 return a;
4231 }
4232
4233 aExp -= 0x3FF;
4234 aSig |= LIT64( 0x0010000000000000 );
4235 zSign = aExp < 0;
bb98fe42 4236 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4237 for (i = 1LL << 51; i > 0; i >>= 1) {
4238 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4239 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4240 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4241 aSig >>= 1;
4242 zSig |= i;
4243 }
4244 }
4245
4246 if ( zSign )
4247 zSig = -zSig;
ff32e16e 4248 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4249}
4250
158142c2
FB
4251/*----------------------------------------------------------------------------
4252| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4253| corresponding value `b', and 0 otherwise. The invalid exception is raised
4254| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4255| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256*----------------------------------------------------------------------------*/
4257
e5a41ffa 4258int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4259{
bb98fe42 4260 uint64_t av, bv;
ff32e16e
PM
4261 a = float64_squash_input_denormal(a, status);
4262 b = float64_squash_input_denormal(b, status);
158142c2
FB
4263
4264 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4265 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4266 ) {
ff32e16e 4267 float_raise(float_flag_invalid, status);
158142c2
FB
4268 return 0;
4269 }
f090c9d4 4270 av = float64_val(a);
a1b91bb4 4271 bv = float64_val(b);
bb98fe42 4272 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4273
4274}
4275
4276/*----------------------------------------------------------------------------
4277| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4278| equal to the corresponding value `b', and 0 otherwise. The invalid
4279| exception is raised if either operand is a NaN. The comparison is performed
4280| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4281*----------------------------------------------------------------------------*/
4282
e5a41ffa 4283int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4284{
4285 flag aSign, bSign;
bb98fe42 4286 uint64_t av, bv;
ff32e16e
PM
4287 a = float64_squash_input_denormal(a, status);
4288 b = float64_squash_input_denormal(b, status);
158142c2
FB
4289
4290 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4291 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4292 ) {
ff32e16e 4293 float_raise(float_flag_invalid, status);
158142c2
FB
4294 return 0;
4295 }
4296 aSign = extractFloat64Sign( a );
4297 bSign = extractFloat64Sign( b );
f090c9d4 4298 av = float64_val(a);
a1b91bb4 4299 bv = float64_val(b);
bb98fe42 4300 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4301 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4302
4303}
4304
4305/*----------------------------------------------------------------------------
4306| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4307| the corresponding value `b', and 0 otherwise. The invalid exception is
4308| raised if either operand is a NaN. The comparison is performed according
4309| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4310*----------------------------------------------------------------------------*/
4311
e5a41ffa 4312int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4313{
4314 flag aSign, bSign;
bb98fe42 4315 uint64_t av, bv;
158142c2 4316
ff32e16e
PM
4317 a = float64_squash_input_denormal(a, status);
4318 b = float64_squash_input_denormal(b, status);
158142c2
FB
4319 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4320 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4321 ) {
ff32e16e 4322 float_raise(float_flag_invalid, status);
158142c2
FB
4323 return 0;
4324 }
4325 aSign = extractFloat64Sign( a );
4326 bSign = extractFloat64Sign( b );
f090c9d4 4327 av = float64_val(a);
a1b91bb4 4328 bv = float64_val(b);
bb98fe42 4329 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4330 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4331
4332}
4333
67b7861d
AJ
4334/*----------------------------------------------------------------------------
4335| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4336| be compared, and 0 otherwise. The invalid exception is raised if either
4337| operand is a NaN. The comparison is performed according to the IEC/IEEE
4338| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4339*----------------------------------------------------------------------------*/
4340
e5a41ffa 4341int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4342{
ff32e16e
PM
4343 a = float64_squash_input_denormal(a, status);
4344 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4345
4346 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4347 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4348 ) {
ff32e16e 4349 float_raise(float_flag_invalid, status);
67b7861d
AJ
4350 return 1;
4351 }
4352 return 0;
4353}
4354
158142c2
FB
4355/*----------------------------------------------------------------------------
4356| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4357| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4358| exception.The comparison is performed according to the IEC/IEEE Standard
4359| for Binary Floating-Point Arithmetic.
158142c2
FB
4360*----------------------------------------------------------------------------*/
4361
e5a41ffa 4362int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4363{
bb98fe42 4364 uint64_t av, bv;
ff32e16e
PM
4365 a = float64_squash_input_denormal(a, status);
4366 b = float64_squash_input_denormal(b, status);
158142c2
FB
4367
4368 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4369 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4370 ) {
af39bc8c
AM
4371 if (float64_is_signaling_nan(a, status)
4372 || float64_is_signaling_nan(b, status)) {
ff32e16e 4373 float_raise(float_flag_invalid, status);
b689362d 4374 }
158142c2
FB
4375 return 0;
4376 }
f090c9d4 4377 av = float64_val(a);
a1b91bb4 4378 bv = float64_val(b);
bb98fe42 4379 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4380
4381}
4382
4383/*----------------------------------------------------------------------------
4384| Returns 1 if the double-precision floating-point value `a' is less than or
4385| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4386| cause an exception. Otherwise, the comparison is performed according to the
4387| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4388*----------------------------------------------------------------------------*/
4389
e5a41ffa 4390int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4391{
4392 flag aSign, bSign;
bb98fe42 4393 uint64_t av, bv;
ff32e16e
PM
4394 a = float64_squash_input_denormal(a, status);
4395 b = float64_squash_input_denormal(b, status);
158142c2
FB
4396
4397 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4398 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4399 ) {
af39bc8c
AM
4400 if (float64_is_signaling_nan(a, status)
4401 || float64_is_signaling_nan(b, status)) {
ff32e16e 4402 float_raise(float_flag_invalid, status);
158142c2
FB
4403 }
4404 return 0;
4405 }
4406 aSign = extractFloat64Sign( a );
4407 bSign = extractFloat64Sign( b );
f090c9d4 4408 av = float64_val(a);
a1b91bb4 4409 bv = float64_val(b);
bb98fe42 4410 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4411 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4412
4413}
4414
4415/*----------------------------------------------------------------------------
4416| Returns 1 if the double-precision floating-point value `a' is less than
4417| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4418| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4419| Standard for Binary Floating-Point Arithmetic.
4420*----------------------------------------------------------------------------*/
4421
e5a41ffa 4422int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4423{
4424 flag aSign, bSign;
bb98fe42 4425 uint64_t av, bv;
ff32e16e
PM
4426 a = float64_squash_input_denormal(a, status);
4427 b = float64_squash_input_denormal(b, status);
158142c2
FB
4428
4429 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4430 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4431 ) {
af39bc8c
AM
4432 if (float64_is_signaling_nan(a, status)
4433 || float64_is_signaling_nan(b, status)) {
ff32e16e 4434 float_raise(float_flag_invalid, status);
158142c2
FB
4435 }
4436 return 0;
4437 }
4438 aSign = extractFloat64Sign( a );
4439 bSign = extractFloat64Sign( b );
f090c9d4 4440 av = float64_val(a);
a1b91bb4 4441 bv = float64_val(b);
bb98fe42 4442 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4443 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4444
4445}
4446
67b7861d
AJ
4447/*----------------------------------------------------------------------------
4448| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4449| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4450| comparison is performed according to the IEC/IEEE Standard for Binary
4451| Floating-Point Arithmetic.
4452*----------------------------------------------------------------------------*/
4453
e5a41ffa 4454int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4455{
ff32e16e
PM
4456 a = float64_squash_input_denormal(a, status);
4457 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4458
4459 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4460 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4461 ) {
af39bc8c
AM
4462 if (float64_is_signaling_nan(a, status)
4463 || float64_is_signaling_nan(b, status)) {
ff32e16e 4464 float_raise(float_flag_invalid, status);
67b7861d
AJ
4465 }
4466 return 1;
4467 }
4468 return 0;
4469}
4470
158142c2
FB
4471/*----------------------------------------------------------------------------
4472| Returns the result of converting the extended double-precision floating-
4473| point value `a' to the 32-bit two's complement integer format. The
4474| conversion is performed according to the IEC/IEEE Standard for Binary
4475| Floating-Point Arithmetic---which means in particular that the conversion
4476| is rounded according to the current rounding mode. If `a' is a NaN, the
4477| largest positive integer is returned. Otherwise, if the conversion
4478| overflows, the largest integer with the same sign as `a' is returned.
4479*----------------------------------------------------------------------------*/
4480
f4014512 4481int32_t floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4482{
4483 flag aSign;
f4014512 4484 int32_t aExp, shiftCount;
bb98fe42 4485 uint64_t aSig;
158142c2 4486
d1eb8f2a
AD
4487 if (floatx80_invalid_encoding(a)) {
4488 float_raise(float_flag_invalid, status);
4489 return 1 << 31;
4490 }
158142c2
FB
4491 aSig = extractFloatx80Frac( a );
4492 aExp = extractFloatx80Exp( a );
4493 aSign = extractFloatx80Sign( a );
bb98fe42 4494 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4495 shiftCount = 0x4037 - aExp;
4496 if ( shiftCount <= 0 ) shiftCount = 1;
4497 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4498 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4499
4500}
4501
4502/*----------------------------------------------------------------------------
4503| Returns the result of converting the extended double-precision floating-
4504| point value `a' to the 32-bit two's complement integer format. The
4505| conversion is performed according to the IEC/IEEE Standard for Binary
4506| Floating-Point Arithmetic, except that the conversion is always rounded
4507| toward zero. If `a' is a NaN, the largest positive integer is returned.
4508| Otherwise, if the conversion overflows, the largest integer with the same
4509| sign as `a' is returned.
4510*----------------------------------------------------------------------------*/
4511
f4014512 4512int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4513{
4514 flag aSign;
f4014512 4515 int32_t aExp, shiftCount;
bb98fe42 4516 uint64_t aSig, savedASig;
b3a6a2e0 4517 int32_t z;
158142c2 4518
d1eb8f2a
AD
4519 if (floatx80_invalid_encoding(a)) {
4520 float_raise(float_flag_invalid, status);
4521 return 1 << 31;
4522 }
158142c2
FB
4523 aSig = extractFloatx80Frac( a );
4524 aExp = extractFloatx80Exp( a );
4525 aSign = extractFloatx80Sign( a );
4526 if ( 0x401E < aExp ) {
bb98fe42 4527 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4528 goto invalid;
4529 }
4530 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4531 if (aExp || aSig) {
4532 status->float_exception_flags |= float_flag_inexact;
4533 }
158142c2
FB
4534 return 0;
4535 }
4536 shiftCount = 0x403E - aExp;
4537 savedASig = aSig;
4538 aSig >>= shiftCount;
4539 z = aSig;
4540 if ( aSign ) z = - z;
4541 if ( ( z < 0 ) ^ aSign ) {
4542 invalid:
ff32e16e 4543 float_raise(float_flag_invalid, status);
bb98fe42 4544 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4545 }
4546 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 4547 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4548 }
4549 return z;
4550
4551}
4552
4553/*----------------------------------------------------------------------------
4554| Returns the result of converting the extended double-precision floating-
4555| point value `a' to the 64-bit two's complement integer format. The
4556| conversion is performed according to the IEC/IEEE Standard for Binary
4557| Floating-Point Arithmetic---which means in particular that the conversion
4558| is rounded according to the current rounding mode. If `a' is a NaN,
4559| the largest positive integer is returned. Otherwise, if the conversion
4560| overflows, the largest integer with the same sign as `a' is returned.
4561*----------------------------------------------------------------------------*/
4562
f42c2224 4563int64_t floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4564{
4565 flag aSign;
f4014512 4566 int32_t aExp, shiftCount;
bb98fe42 4567 uint64_t aSig, aSigExtra;
158142c2 4568
d1eb8f2a
AD
4569 if (floatx80_invalid_encoding(a)) {
4570 float_raise(float_flag_invalid, status);
4571 return 1ULL << 63;
4572 }
158142c2
FB
4573 aSig = extractFloatx80Frac( a );
4574 aExp = extractFloatx80Exp( a );
4575 aSign = extractFloatx80Sign( a );
4576 shiftCount = 0x403E - aExp;
4577 if ( shiftCount <= 0 ) {
4578 if ( shiftCount ) {
ff32e16e 4579 float_raise(float_flag_invalid, status);
0f605c88 4580 if (!aSign || floatx80_is_any_nan(a)) {
158142c2
FB
4581 return LIT64( 0x7FFFFFFFFFFFFFFF );
4582 }
bb98fe42 4583 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4584 }
4585 aSigExtra = 0;
4586 }
4587 else {
4588 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4589 }
ff32e16e 4590 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4591
4592}
4593
4594/*----------------------------------------------------------------------------
4595| Returns the result of converting the extended double-precision floating-
4596| point value `a' to the 64-bit two's complement integer format. The
4597| conversion is performed according to the IEC/IEEE Standard for Binary
4598| Floating-Point Arithmetic, except that the conversion is always rounded
4599| toward zero. If `a' is a NaN, the largest positive integer is returned.
4600| Otherwise, if the conversion overflows, the largest integer with the same
4601| sign as `a' is returned.
4602*----------------------------------------------------------------------------*/
4603
f42c2224 4604int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4605{
4606 flag aSign;
f4014512 4607 int32_t aExp, shiftCount;
bb98fe42 4608 uint64_t aSig;
f42c2224 4609 int64_t z;
158142c2 4610
d1eb8f2a
AD
4611 if (floatx80_invalid_encoding(a)) {
4612 float_raise(float_flag_invalid, status);
4613 return 1ULL << 63;
4614 }
158142c2
FB
4615 aSig = extractFloatx80Frac( a );
4616 aExp = extractFloatx80Exp( a );
4617 aSign = extractFloatx80Sign( a );
4618 shiftCount = aExp - 0x403E;
4619 if ( 0 <= shiftCount ) {
4620 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4621 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4622 float_raise(float_flag_invalid, status);
158142c2
FB
4623 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4624 return LIT64( 0x7FFFFFFFFFFFFFFF );
4625 }
4626 }
bb98fe42 4627 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4628 }
4629 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4630 if (aExp | aSig) {
4631 status->float_exception_flags |= float_flag_inexact;
4632 }
158142c2
FB
4633 return 0;
4634 }
4635 z = aSig>>( - shiftCount );
bb98fe42 4636 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 4637 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4638 }
4639 if ( aSign ) z = - z;
4640 return z;
4641
4642}
4643
4644/*----------------------------------------------------------------------------
4645| Returns the result of converting the extended double-precision floating-
4646| point value `a' to the single-precision floating-point format. The
4647| conversion is performed according to the IEC/IEEE Standard for Binary
4648| Floating-Point Arithmetic.
4649*----------------------------------------------------------------------------*/
4650
e5a41ffa 4651float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
4652{
4653 flag aSign;
f4014512 4654 int32_t aExp;
bb98fe42 4655 uint64_t aSig;
158142c2 4656
d1eb8f2a
AD
4657 if (floatx80_invalid_encoding(a)) {
4658 float_raise(float_flag_invalid, status);
4659 return float32_default_nan(status);
4660 }
158142c2
FB
4661 aSig = extractFloatx80Frac( a );
4662 aExp = extractFloatx80Exp( a );
4663 aSign = extractFloatx80Sign( a );
4664 if ( aExp == 0x7FFF ) {
bb98fe42 4665 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4666 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4667 }
4668 return packFloat32( aSign, 0xFF, 0 );
4669 }
4670 shift64RightJamming( aSig, 33, &aSig );
4671 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 4672 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
4673
4674}
4675
4676/*----------------------------------------------------------------------------
4677| Returns the result of converting the extended double-precision floating-
4678| point value `a' to the double-precision floating-point format. The
4679| conversion is performed according to the IEC/IEEE Standard for Binary
4680| Floating-Point Arithmetic.
4681*----------------------------------------------------------------------------*/
4682
e5a41ffa 4683float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
4684{
4685 flag aSign;
f4014512 4686 int32_t aExp;
bb98fe42 4687 uint64_t aSig, zSig;
158142c2 4688
d1eb8f2a
AD
4689 if (floatx80_invalid_encoding(a)) {
4690 float_raise(float_flag_invalid, status);
4691 return float64_default_nan(status);
4692 }
158142c2
FB
4693 aSig = extractFloatx80Frac( a );
4694 aExp = extractFloatx80Exp( a );
4695 aSign = extractFloatx80Sign( a );
4696 if ( aExp == 0x7FFF ) {
bb98fe42 4697 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4698 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4699 }
4700 return packFloat64( aSign, 0x7FF, 0 );
4701 }
4702 shift64RightJamming( aSig, 1, &zSig );
4703 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 4704 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
4705
4706}
4707
158142c2
FB
4708/*----------------------------------------------------------------------------
4709| Returns the result of converting the extended double-precision floating-
4710| point value `a' to the quadruple-precision floating-point format. The
4711| conversion is performed according to the IEC/IEEE Standard for Binary
4712| Floating-Point Arithmetic.
4713*----------------------------------------------------------------------------*/
4714
e5a41ffa 4715float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
4716{
4717 flag aSign;
0c48262d 4718 int aExp;
bb98fe42 4719 uint64_t aSig, zSig0, zSig1;
158142c2 4720
d1eb8f2a
AD
4721 if (floatx80_invalid_encoding(a)) {
4722 float_raise(float_flag_invalid, status);
4723 return float128_default_nan(status);
4724 }
158142c2
FB
4725 aSig = extractFloatx80Frac( a );
4726 aExp = extractFloatx80Exp( a );
4727 aSign = extractFloatx80Sign( a );
bb98fe42 4728 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4729 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4730 }
4731 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4732 return packFloat128( aSign, aExp, zSig0, zSig1 );
4733
4734}
4735
0f721292
LV
4736/*----------------------------------------------------------------------------
4737| Rounds the extended double-precision floating-point value `a'
4738| to the precision provided by floatx80_rounding_precision and returns the
4739| result as an extended double-precision floating-point value.
4740| The operation is performed according to the IEC/IEEE Standard for Binary
4741| Floating-Point Arithmetic.
4742*----------------------------------------------------------------------------*/
4743
4744floatx80 floatx80_round(floatx80 a, float_status *status)
4745{
4746 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4747 extractFloatx80Sign(a),
4748 extractFloatx80Exp(a),
4749 extractFloatx80Frac(a), 0, status);
4750}
4751
158142c2
FB
4752/*----------------------------------------------------------------------------
4753| Rounds the extended double-precision floating-point value `a' to an integer,
4754| and returns the result as an extended quadruple-precision floating-point
4755| value. The operation is performed according to the IEC/IEEE Standard for
4756| Binary Floating-Point Arithmetic.
4757*----------------------------------------------------------------------------*/
4758
e5a41ffa 4759floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
4760{
4761 flag aSign;
f4014512 4762 int32_t aExp;
bb98fe42 4763 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4764 floatx80 z;
4765
d1eb8f2a
AD
4766 if (floatx80_invalid_encoding(a)) {
4767 float_raise(float_flag_invalid, status);
4768 return floatx80_default_nan(status);
4769 }
158142c2
FB
4770 aExp = extractFloatx80Exp( a );
4771 if ( 0x403E <= aExp ) {
bb98fe42 4772 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 4773 return propagateFloatx80NaN(a, a, status);
158142c2
FB
4774 }
4775 return a;
4776 }
4777 if ( aExp < 0x3FFF ) {
4778 if ( ( aExp == 0 )
bb98fe42 4779 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4780 return a;
4781 }
a2f2d288 4782 status->float_exception_flags |= float_flag_inexact;
158142c2 4783 aSign = extractFloatx80Sign( a );
a2f2d288 4784 switch (status->float_rounding_mode) {
158142c2 4785 case float_round_nearest_even:
bb98fe42 4786 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4787 ) {
4788 return
4789 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4790 }
4791 break;
f9288a76
PM
4792 case float_round_ties_away:
4793 if (aExp == 0x3FFE) {
4794 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4795 }
4796 break;
158142c2
FB
4797 case float_round_down:
4798 return
4799 aSign ?
4800 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4801 : packFloatx80( 0, 0, 0 );
4802 case float_round_up:
4803 return
4804 aSign ? packFloatx80( 1, 0, 0 )
4805 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4806 }
4807 return packFloatx80( aSign, 0, 0 );
4808 }
4809 lastBitMask = 1;
4810 lastBitMask <<= 0x403E - aExp;
4811 roundBitsMask = lastBitMask - 1;
4812 z = a;
a2f2d288 4813 switch (status->float_rounding_mode) {
dc355b76 4814 case float_round_nearest_even:
158142c2 4815 z.low += lastBitMask>>1;
dc355b76
PM
4816 if ((z.low & roundBitsMask) == 0) {
4817 z.low &= ~lastBitMask;
4818 }
4819 break;
f9288a76
PM
4820 case float_round_ties_away:
4821 z.low += lastBitMask >> 1;
4822 break;
dc355b76
PM
4823 case float_round_to_zero:
4824 break;
4825 case float_round_up:
4826 if (!extractFloatx80Sign(z)) {
4827 z.low += roundBitsMask;
4828 }
4829 break;
4830 case float_round_down:
4831 if (extractFloatx80Sign(z)) {
158142c2
FB
4832 z.low += roundBitsMask;
4833 }
dc355b76
PM
4834 break;
4835 default:
4836 abort();
158142c2
FB
4837 }
4838 z.low &= ~ roundBitsMask;
4839 if ( z.low == 0 ) {
4840 ++z.high;
4841 z.low = LIT64( 0x8000000000000000 );
4842 }
a2f2d288
PM
4843 if (z.low != a.low) {
4844 status->float_exception_flags |= float_flag_inexact;
4845 }
158142c2
FB
4846 return z;
4847
4848}
4849
4850/*----------------------------------------------------------------------------
4851| Returns the result of adding the absolute values of the extended double-
4852| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4853| negated before being returned. `zSign' is ignored if the result is a NaN.
4854| The addition is performed according to the IEC/IEEE Standard for Binary
4855| Floating-Point Arithmetic.
4856*----------------------------------------------------------------------------*/
4857
e5a41ffa
PM
4858static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4859 float_status *status)
158142c2 4860{
f4014512 4861 int32_t aExp, bExp, zExp;
bb98fe42 4862 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 4863 int32_t expDiff;
158142c2
FB
4864
4865 aSig = extractFloatx80Frac( a );
4866 aExp = extractFloatx80Exp( a );
4867 bSig = extractFloatx80Frac( b );
4868 bExp = extractFloatx80Exp( b );
4869 expDiff = aExp - bExp;
4870 if ( 0 < expDiff ) {
4871 if ( aExp == 0x7FFF ) {
ff32e16e
PM
4872 if ((uint64_t)(aSig << 1)) {
4873 return propagateFloatx80NaN(a, b, status);
4874 }
158142c2
FB
4875 return a;
4876 }
4877 if ( bExp == 0 ) --expDiff;
4878 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4879 zExp = aExp;
4880 }
4881 else if ( expDiff < 0 ) {
4882 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4883 if ((uint64_t)(bSig << 1)) {
4884 return propagateFloatx80NaN(a, b, status);
4885 }
0f605c88
LV
4886 return packFloatx80(zSign,
4887 floatx80_infinity_high,
4888 floatx80_infinity_low);
158142c2
FB
4889 }
4890 if ( aExp == 0 ) ++expDiff;
4891 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4892 zExp = bExp;
4893 }
4894 else {
4895 if ( aExp == 0x7FFF ) {
bb98fe42 4896 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 4897 return propagateFloatx80NaN(a, b, status);
158142c2
FB
4898 }
4899 return a;
4900 }
4901 zSig1 = 0;
4902 zSig0 = aSig + bSig;
4903 if ( aExp == 0 ) {
4904 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4905 goto roundAndPack;
4906 }
4907 zExp = aExp;
4908 goto shiftRight1;
4909 }
4910 zSig0 = aSig + bSig;
bb98fe42 4911 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4912 shiftRight1:
4913 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4914 zSig0 |= LIT64( 0x8000000000000000 );
4915 ++zExp;
4916 roundAndPack:
a2f2d288 4917 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 4918 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
4919}
4920
4921/*----------------------------------------------------------------------------
4922| Returns the result of subtracting the absolute values of the extended
4923| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4924| difference is negated before being returned. `zSign' is ignored if the
4925| result is a NaN. The subtraction is performed according to the IEC/IEEE
4926| Standard for Binary Floating-Point Arithmetic.
4927*----------------------------------------------------------------------------*/
4928
e5a41ffa
PM
4929static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4930 float_status *status)
158142c2 4931{
f4014512 4932 int32_t aExp, bExp, zExp;
bb98fe42 4933 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 4934 int32_t expDiff;
158142c2
FB
4935
4936 aSig = extractFloatx80Frac( a );
4937 aExp = extractFloatx80Exp( a );
4938 bSig = extractFloatx80Frac( b );
4939 bExp = extractFloatx80Exp( b );
4940 expDiff = aExp - bExp;
4941 if ( 0 < expDiff ) goto aExpBigger;
4942 if ( expDiff < 0 ) goto bExpBigger;
4943 if ( aExp == 0x7FFF ) {
bb98fe42 4944 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 4945 return propagateFloatx80NaN(a, b, status);
158142c2 4946 }
ff32e16e 4947 float_raise(float_flag_invalid, status);
af39bc8c 4948 return floatx80_default_nan(status);
158142c2
FB
4949 }
4950 if ( aExp == 0 ) {
4951 aExp = 1;
4952 bExp = 1;
4953 }
4954 zSig1 = 0;
4955 if ( bSig < aSig ) goto aBigger;
4956 if ( aSig < bSig ) goto bBigger;
a2f2d288 4957 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
4958 bExpBigger:
4959 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4960 if ((uint64_t)(bSig << 1)) {
4961 return propagateFloatx80NaN(a, b, status);
4962 }
0f605c88
LV
4963 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4964 floatx80_infinity_low);
158142c2
FB
4965 }
4966 if ( aExp == 0 ) ++expDiff;
4967 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4968 bBigger:
4969 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4970 zExp = bExp;
4971 zSign ^= 1;
4972 goto normalizeRoundAndPack;
4973 aExpBigger:
4974 if ( aExp == 0x7FFF ) {
ff32e16e
PM
4975 if ((uint64_t)(aSig << 1)) {
4976 return propagateFloatx80NaN(a, b, status);
4977 }
158142c2
FB
4978 return a;
4979 }
4980 if ( bExp == 0 ) --expDiff;
4981 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4982 aBigger:
4983 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4984 zExp = aExp;
4985 normalizeRoundAndPack:
a2f2d288 4986 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 4987 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
4988}
4989
4990/*----------------------------------------------------------------------------
4991| Returns the result of adding the extended double-precision floating-point
4992| values `a' and `b'. The operation is performed according to the IEC/IEEE
4993| Standard for Binary Floating-Point Arithmetic.
4994*----------------------------------------------------------------------------*/
4995
e5a41ffa 4996floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
4997{
4998 flag aSign, bSign;
4999
d1eb8f2a
AD
5000 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5001 float_raise(float_flag_invalid, status);
5002 return floatx80_default_nan(status);
5003 }
158142c2
FB
5004 aSign = extractFloatx80Sign( a );
5005 bSign = extractFloatx80Sign( b );
5006 if ( aSign == bSign ) {
ff32e16e 5007 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5008 }
5009 else {
ff32e16e 5010 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5011 }
5012
5013}
5014
5015/*----------------------------------------------------------------------------
5016| Returns the result of subtracting the extended double-precision floating-
5017| point values `a' and `b'. The operation is performed according to the
5018| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019*----------------------------------------------------------------------------*/
5020
e5a41ffa 5021floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5022{
5023 flag aSign, bSign;
5024
d1eb8f2a
AD
5025 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5026 float_raise(float_flag_invalid, status);
5027 return floatx80_default_nan(status);
5028 }
158142c2
FB
5029 aSign = extractFloatx80Sign( a );
5030 bSign = extractFloatx80Sign( b );
5031 if ( aSign == bSign ) {
ff32e16e 5032 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5033 }
5034 else {
ff32e16e 5035 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5036 }
5037
5038}
5039
5040/*----------------------------------------------------------------------------
5041| Returns the result of multiplying the extended double-precision floating-
5042| point values `a' and `b'. The operation is performed according to the
5043| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5044*----------------------------------------------------------------------------*/
5045
e5a41ffa 5046floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5047{
5048 flag aSign, bSign, zSign;
f4014512 5049 int32_t aExp, bExp, zExp;
bb98fe42 5050 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 5051
d1eb8f2a
AD
5052 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5053 float_raise(float_flag_invalid, status);
5054 return floatx80_default_nan(status);
5055 }
158142c2
FB
5056 aSig = extractFloatx80Frac( a );
5057 aExp = extractFloatx80Exp( a );
5058 aSign = extractFloatx80Sign( a );
5059 bSig = extractFloatx80Frac( b );
5060 bExp = extractFloatx80Exp( b );
5061 bSign = extractFloatx80Sign( b );
5062 zSign = aSign ^ bSign;
5063 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5064 if ( (uint64_t) ( aSig<<1 )
5065 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5066 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5067 }
5068 if ( ( bExp | bSig ) == 0 ) goto invalid;
0f605c88
LV
5069 return packFloatx80(zSign, floatx80_infinity_high,
5070 floatx80_infinity_low);
158142c2
FB
5071 }
5072 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5073 if ((uint64_t)(bSig << 1)) {
5074 return propagateFloatx80NaN(a, b, status);
5075 }
158142c2
FB
5076 if ( ( aExp | aSig ) == 0 ) {
5077 invalid:
ff32e16e 5078 float_raise(float_flag_invalid, status);
af39bc8c 5079 return floatx80_default_nan(status);
158142c2 5080 }
0f605c88
LV
5081 return packFloatx80(zSign, floatx80_infinity_high,
5082 floatx80_infinity_low);
158142c2
FB
5083 }
5084 if ( aExp == 0 ) {
5085 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5086 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5087 }
5088 if ( bExp == 0 ) {
5089 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5090 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5091 }
5092 zExp = aExp + bExp - 0x3FFE;
5093 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5094 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5095 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5096 --zExp;
5097 }
a2f2d288 5098 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5099 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5100}
5101
5102/*----------------------------------------------------------------------------
5103| Returns the result of dividing the extended double-precision floating-point
5104| value `a' by the corresponding value `b'. The operation is performed
5105| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5106*----------------------------------------------------------------------------*/
5107
e5a41ffa 5108floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5109{
5110 flag aSign, bSign, zSign;
f4014512 5111 int32_t aExp, bExp, zExp;
bb98fe42
AF
5112 uint64_t aSig, bSig, zSig0, zSig1;
5113 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2 5114
d1eb8f2a
AD
5115 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5116 float_raise(float_flag_invalid, status);
5117 return floatx80_default_nan(status);
5118 }
158142c2
FB
5119 aSig = extractFloatx80Frac( a );
5120 aExp = extractFloatx80Exp( a );
5121 aSign = extractFloatx80Sign( a );
5122 bSig = extractFloatx80Frac( b );
5123 bExp = extractFloatx80Exp( b );
5124 bSign = extractFloatx80Sign( b );
5125 zSign = aSign ^ bSign;
5126 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5127 if ((uint64_t)(aSig << 1)) {
5128 return propagateFloatx80NaN(a, b, status);
5129 }
158142c2 5130 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5131 if ((uint64_t)(bSig << 1)) {
5132 return propagateFloatx80NaN(a, b, status);
5133 }
158142c2
FB
5134 goto invalid;
5135 }
0f605c88
LV
5136 return packFloatx80(zSign, floatx80_infinity_high,
5137 floatx80_infinity_low);
158142c2
FB
5138 }
5139 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5140 if ((uint64_t)(bSig << 1)) {
5141 return propagateFloatx80NaN(a, b, status);
5142 }
158142c2
FB
5143 return packFloatx80( zSign, 0, 0 );
5144 }
5145 if ( bExp == 0 ) {
5146 if ( bSig == 0 ) {
5147 if ( ( aExp | aSig ) == 0 ) {
5148 invalid:
ff32e16e 5149 float_raise(float_flag_invalid, status);
af39bc8c 5150 return floatx80_default_nan(status);
158142c2 5151 }
ff32e16e 5152 float_raise(float_flag_divbyzero, status);
0f605c88
LV
5153 return packFloatx80(zSign, floatx80_infinity_high,
5154 floatx80_infinity_low);
158142c2
FB
5155 }
5156 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5157 }
5158 if ( aExp == 0 ) {
5159 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5160 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5161 }
5162 zExp = aExp - bExp + 0x3FFE;
5163 rem1 = 0;
5164 if ( bSig <= aSig ) {
5165 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5166 ++zExp;
5167 }
5168 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5169 mul64To128( bSig, zSig0, &term0, &term1 );
5170 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5171 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5172 --zSig0;
5173 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5174 }
5175 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5176 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5177 mul64To128( bSig, zSig1, &term1, &term2 );
5178 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5179 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5180 --zSig1;
5181 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5182 }
5183 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5184 }
a2f2d288 5185 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5186 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5187}
5188
5189/*----------------------------------------------------------------------------
5190| Returns the remainder of the extended double-precision floating-point value
5191| `a' with respect to the corresponding value `b'. The operation is performed
5192| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5193*----------------------------------------------------------------------------*/
5194
e5a41ffa 5195floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5196{
ed086f3d 5197 flag aSign, zSign;
f4014512 5198 int32_t aExp, bExp, expDiff;
bb98fe42
AF
5199 uint64_t aSig0, aSig1, bSig;
5200 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2 5201
d1eb8f2a
AD
5202 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5203 float_raise(float_flag_invalid, status);
5204 return floatx80_default_nan(status);
5205 }
158142c2
FB
5206 aSig0 = extractFloatx80Frac( a );
5207 aExp = extractFloatx80Exp( a );
5208 aSign = extractFloatx80Sign( a );
5209 bSig = extractFloatx80Frac( b );
5210 bExp = extractFloatx80Exp( b );
158142c2 5211 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5212 if ( (uint64_t) ( aSig0<<1 )
5213 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5214 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5215 }
5216 goto invalid;
5217 }
5218 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5219 if ((uint64_t)(bSig << 1)) {
5220 return propagateFloatx80NaN(a, b, status);
5221 }
158142c2
FB
5222 return a;
5223 }
5224 if ( bExp == 0 ) {
5225 if ( bSig == 0 ) {
5226 invalid:
ff32e16e 5227 float_raise(float_flag_invalid, status);
af39bc8c 5228 return floatx80_default_nan(status);
158142c2
FB
5229 }
5230 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5231 }
5232 if ( aExp == 0 ) {
bb98fe42 5233 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5234 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5235 }
5236 bSig |= LIT64( 0x8000000000000000 );
5237 zSign = aSign;
5238 expDiff = aExp - bExp;
5239 aSig1 = 0;
5240 if ( expDiff < 0 ) {
5241 if ( expDiff < -1 ) return a;
5242 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5243 expDiff = 0;
5244 }
5245 q = ( bSig <= aSig0 );
5246 if ( q ) aSig0 -= bSig;
5247 expDiff -= 64;
5248 while ( 0 < expDiff ) {
5249 q = estimateDiv128To64( aSig0, aSig1, bSig );
5250 q = ( 2 < q ) ? q - 2 : 0;
5251 mul64To128( bSig, q, &term0, &term1 );
5252 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5253 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5254 expDiff -= 62;
5255 }
5256 expDiff += 64;
5257 if ( 0 < expDiff ) {
5258 q = estimateDiv128To64( aSig0, aSig1, bSig );
5259 q = ( 2 < q ) ? q - 2 : 0;
5260 q >>= 64 - expDiff;
5261 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5262 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5263 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5264 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5265 ++q;
5266 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5267 }
5268 }
5269 else {
5270 term1 = 0;
5271 term0 = bSig;
5272 }
5273 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5274 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5275 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5276 && ( q & 1 ) )
5277 ) {
5278 aSig0 = alternateASig0;
5279 aSig1 = alternateASig1;
5280 zSign = ! zSign;
5281 }
5282 return
5283 normalizeRoundAndPackFloatx80(
ff32e16e 5284 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5285
5286}
5287
5288/*----------------------------------------------------------------------------
5289| Returns the square root of the extended double-precision floating-point
5290| value `a'. The operation is performed according to the IEC/IEEE Standard
5291| for Binary Floating-Point Arithmetic.
5292*----------------------------------------------------------------------------*/
5293
e5a41ffa 5294floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5295{
5296 flag aSign;
f4014512 5297 int32_t aExp, zExp;
bb98fe42
AF
5298 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5299 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2 5300
d1eb8f2a
AD
5301 if (floatx80_invalid_encoding(a)) {
5302 float_raise(float_flag_invalid, status);
5303 return floatx80_default_nan(status);
5304 }
158142c2
FB
5305 aSig0 = extractFloatx80Frac( a );
5306 aExp = extractFloatx80Exp( a );
5307 aSign = extractFloatx80Sign( a );
5308 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5309 if ((uint64_t)(aSig0 << 1)) {
5310 return propagateFloatx80NaN(a, a, status);
5311 }
158142c2
FB
5312 if ( ! aSign ) return a;
5313 goto invalid;
5314 }
5315 if ( aSign ) {
5316 if ( ( aExp | aSig0 ) == 0 ) return a;
5317 invalid:
ff32e16e 5318 float_raise(float_flag_invalid, status);
af39bc8c 5319 return floatx80_default_nan(status);
158142c2
FB
5320 }
5321 if ( aExp == 0 ) {
5322 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5323 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5324 }
5325 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5326 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5327 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5328 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5329 doubleZSig0 = zSig0<<1;
5330 mul64To128( zSig0, zSig0, &term0, &term1 );
5331 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5332 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5333 --zSig0;
5334 doubleZSig0 -= 2;
5335 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5336 }
5337 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5338 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5339 if ( zSig1 == 0 ) zSig1 = 1;
5340 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5341 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5342 mul64To128( zSig1, zSig1, &term2, &term3 );
5343 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5344 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5345 --zSig1;
5346 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5347 term3 |= 1;
5348 term2 |= doubleZSig0;
5349 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5350 }
5351 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5352 }
5353 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5354 zSig0 |= doubleZSig0;
a2f2d288
PM
5355 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5356 0, zExp, zSig0, zSig1, status);
158142c2
FB
5357}
5358
5359/*----------------------------------------------------------------------------
b689362d
AJ
5360| Returns 1 if the extended double-precision floating-point value `a' is equal
5361| to the corresponding value `b', and 0 otherwise. The invalid exception is
5362| raised if either operand is a NaN. Otherwise, the comparison is performed
5363| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5364*----------------------------------------------------------------------------*/
5365
e5a41ffa 5366int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5367{
5368
d1eb8f2a
AD
5369 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5370 || (extractFloatx80Exp(a) == 0x7FFF
5371 && (uint64_t) (extractFloatx80Frac(a) << 1))
5372 || (extractFloatx80Exp(b) == 0x7FFF
5373 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5374 ) {
ff32e16e 5375 float_raise(float_flag_invalid, status);
158142c2
FB
5376 return 0;
5377 }
5378 return
5379 ( a.low == b.low )
5380 && ( ( a.high == b.high )
5381 || ( ( a.low == 0 )
bb98fe42 5382 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5383 );
5384
5385}
5386
5387/*----------------------------------------------------------------------------
5388| Returns 1 if the extended double-precision floating-point value `a' is
5389| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5390| invalid exception is raised if either operand is a NaN. The comparison is
5391| performed according to the IEC/IEEE Standard for Binary Floating-Point
5392| Arithmetic.
158142c2
FB
5393*----------------------------------------------------------------------------*/
5394
e5a41ffa 5395int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5396{
5397 flag aSign, bSign;
5398
d1eb8f2a
AD
5399 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5400 || (extractFloatx80Exp(a) == 0x7FFF
5401 && (uint64_t) (extractFloatx80Frac(a) << 1))
5402 || (extractFloatx80Exp(b) == 0x7FFF
5403 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5404 ) {
ff32e16e 5405 float_raise(float_flag_invalid, status);
158142c2
FB
5406 return 0;
5407 }
5408 aSign = extractFloatx80Sign( a );
5409 bSign = extractFloatx80Sign( b );
5410 if ( aSign != bSign ) {
5411 return
5412 aSign
bb98fe42 5413 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5414 == 0 );
5415 }
5416 return
5417 aSign ? le128( b.high, b.low, a.high, a.low )
5418 : le128( a.high, a.low, b.high, b.low );
5419
5420}
5421
5422/*----------------------------------------------------------------------------
5423| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5424| less than the corresponding value `b', and 0 otherwise. The invalid
5425| exception is raised if either operand is a NaN. The comparison is performed
5426| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5427*----------------------------------------------------------------------------*/
5428
e5a41ffa 5429int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5430{
5431 flag aSign, bSign;
5432
d1eb8f2a
AD
5433 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5434 || (extractFloatx80Exp(a) == 0x7FFF
5435 && (uint64_t) (extractFloatx80Frac(a) << 1))
5436 || (extractFloatx80Exp(b) == 0x7FFF
5437 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5438 ) {
ff32e16e 5439 float_raise(float_flag_invalid, status);
158142c2
FB
5440 return 0;
5441 }
5442 aSign = extractFloatx80Sign( a );
5443 bSign = extractFloatx80Sign( b );
5444 if ( aSign != bSign ) {
5445 return
5446 aSign
bb98fe42 5447 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5448 != 0 );
5449 }
5450 return
5451 aSign ? lt128( b.high, b.low, a.high, a.low )
5452 : lt128( a.high, a.low, b.high, b.low );
5453
5454}
5455
67b7861d
AJ
5456/*----------------------------------------------------------------------------
5457| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5458| cannot be compared, and 0 otherwise. The invalid exception is raised if
5459| either operand is a NaN. The comparison is performed according to the
5460| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5461*----------------------------------------------------------------------------*/
e5a41ffa 5462int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d 5463{
d1eb8f2a
AD
5464 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5465 || (extractFloatx80Exp(a) == 0x7FFF
5466 && (uint64_t) (extractFloatx80Frac(a) << 1))
5467 || (extractFloatx80Exp(b) == 0x7FFF
5468 && (uint64_t) (extractFloatx80Frac(b) << 1))
67b7861d 5469 ) {
ff32e16e 5470 float_raise(float_flag_invalid, status);
67b7861d
AJ
5471 return 1;
5472 }
5473 return 0;
5474}
5475
158142c2 5476/*----------------------------------------------------------------------------
b689362d 5477| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5478| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5479| cause an exception. The comparison is performed according to the IEC/IEEE
5480| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5481*----------------------------------------------------------------------------*/
5482
e5a41ffa 5483int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5484{
5485
d1eb8f2a
AD
5486 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5487 float_raise(float_flag_invalid, status);
5488 return 0;
5489 }
158142c2 5490 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5491 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5492 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5493 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5494 ) {
af39bc8c
AM
5495 if (floatx80_is_signaling_nan(a, status)
5496 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5497 float_raise(float_flag_invalid, status);
b689362d 5498 }
158142c2
FB
5499 return 0;
5500 }
5501 return
5502 ( a.low == b.low )
5503 && ( ( a.high == b.high )
5504 || ( ( a.low == 0 )
bb98fe42 5505 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5506 );
5507
5508}
5509
5510/*----------------------------------------------------------------------------
5511| Returns 1 if the extended double-precision floating-point value `a' is less
5512| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5513| do not cause an exception. Otherwise, the comparison is performed according
5514| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5515*----------------------------------------------------------------------------*/
5516
e5a41ffa 5517int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5518{
5519 flag aSign, bSign;
5520
d1eb8f2a
AD
5521 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5522 float_raise(float_flag_invalid, status);
5523 return 0;
5524 }
158142c2 5525 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5526 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5527 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5528 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5529 ) {
af39bc8c
AM
5530 if (floatx80_is_signaling_nan(a, status)
5531 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5532 float_raise(float_flag_invalid, status);
158142c2
FB
5533 }
5534 return 0;
5535 }
5536 aSign = extractFloatx80Sign( a );
5537 bSign = extractFloatx80Sign( b );
5538 if ( aSign != bSign ) {
5539 return
5540 aSign
bb98fe42 5541 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5542 == 0 );
5543 }
5544 return
5545 aSign ? le128( b.high, b.low, a.high, a.low )
5546 : le128( a.high, a.low, b.high, b.low );
5547
5548}
5549
5550/*----------------------------------------------------------------------------
5551| Returns 1 if the extended double-precision floating-point value `a' is less
5552| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5553| an exception. Otherwise, the comparison is performed according to the
5554| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555*----------------------------------------------------------------------------*/
5556
e5a41ffa 5557int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5558{
5559 flag aSign, bSign;
5560
d1eb8f2a
AD
5561 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5562 float_raise(float_flag_invalid, status);
5563 return 0;
5564 }
158142c2 5565 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5566 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5567 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5568 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5569 ) {
af39bc8c
AM
5570 if (floatx80_is_signaling_nan(a, status)
5571 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5572 float_raise(float_flag_invalid, status);
158142c2
FB
5573 }
5574 return 0;
5575 }
5576 aSign = extractFloatx80Sign( a );
5577 bSign = extractFloatx80Sign( b );
5578 if ( aSign != bSign ) {
5579 return
5580 aSign
bb98fe42 5581 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5582 != 0 );
5583 }
5584 return
5585 aSign ? lt128( b.high, b.low, a.high, a.low )
5586 : lt128( a.high, a.low, b.high, b.low );
5587
5588}
5589
67b7861d
AJ
5590/*----------------------------------------------------------------------------
5591| Returns 1 if the extended double-precision floating-point values `a' and `b'
5592| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5593| The comparison is performed according to the IEC/IEEE Standard for Binary
5594| Floating-Point Arithmetic.
5595*----------------------------------------------------------------------------*/
e5a41ffa 5596int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d 5597{
d1eb8f2a
AD
5598 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5599 float_raise(float_flag_invalid, status);
5600 return 1;
5601 }
67b7861d
AJ
5602 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5603 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5604 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5605 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5606 ) {
af39bc8c
AM
5607 if (floatx80_is_signaling_nan(a, status)
5608 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5609 float_raise(float_flag_invalid, status);
67b7861d
AJ
5610 }
5611 return 1;
5612 }
5613 return 0;
5614}
5615
158142c2
FB
5616/*----------------------------------------------------------------------------
5617| Returns the result of converting the quadruple-precision floating-point
5618| value `a' to the 32-bit two's complement integer format. The conversion
5619| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5620| Arithmetic---which means in particular that the conversion is rounded
5621| according to the current rounding mode. If `a' is a NaN, the largest
5622| positive integer is returned. Otherwise, if the conversion overflows, the
5623| largest integer with the same sign as `a' is returned.
5624*----------------------------------------------------------------------------*/
5625
f4014512 5626int32_t float128_to_int32(float128 a, float_status *status)
158142c2
FB
5627{
5628 flag aSign;
f4014512 5629 int32_t aExp, shiftCount;
bb98fe42 5630 uint64_t aSig0, aSig1;
158142c2
FB
5631
5632 aSig1 = extractFloat128Frac1( a );
5633 aSig0 = extractFloat128Frac0( a );
5634 aExp = extractFloat128Exp( a );
5635 aSign = extractFloat128Sign( a );
5636 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5637 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5638 aSig0 |= ( aSig1 != 0 );
5639 shiftCount = 0x4028 - aExp;
5640 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 5641 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
5642
5643}
5644
5645/*----------------------------------------------------------------------------
5646| Returns the result of converting the quadruple-precision floating-point
5647| value `a' to the 32-bit two's complement integer format. The conversion
5648| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5649| Arithmetic, except that the conversion is always rounded toward zero. If
5650| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5651| conversion overflows, the largest integer with the same sign as `a' is
5652| returned.
5653*----------------------------------------------------------------------------*/
5654
f4014512 5655int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
5656{
5657 flag aSign;
f4014512 5658 int32_t aExp, shiftCount;
bb98fe42 5659 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5660 int32_t z;
158142c2
FB
5661
5662 aSig1 = extractFloat128Frac1( a );
5663 aSig0 = extractFloat128Frac0( a );
5664 aExp = extractFloat128Exp( a );
5665 aSign = extractFloat128Sign( a );
5666 aSig0 |= ( aSig1 != 0 );
5667 if ( 0x401E < aExp ) {
5668 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5669 goto invalid;
5670 }
5671 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
5672 if (aExp || aSig0) {
5673 status->float_exception_flags |= float_flag_inexact;
5674 }
158142c2
FB
5675 return 0;
5676 }
5677 aSig0 |= LIT64( 0x0001000000000000 );
5678 shiftCount = 0x402F - aExp;
5679 savedASig = aSig0;
5680 aSig0 >>= shiftCount;
5681 z = aSig0;
5682 if ( aSign ) z = - z;
5683 if ( ( z < 0 ) ^ aSign ) {
5684 invalid:
ff32e16e 5685 float_raise(float_flag_invalid, status);
bb98fe42 5686 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5687 }
5688 if ( ( aSig0<<shiftCount ) != savedASig ) {
a2f2d288 5689 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5690 }
5691 return z;
5692
5693}
5694
5695/*----------------------------------------------------------------------------
5696| Returns the result of converting the quadruple-precision floating-point
5697| value `a' to the 64-bit two's complement integer format. The conversion
5698| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5699| Arithmetic---which means in particular that the conversion is rounded
5700| according to the current rounding mode. If `a' is a NaN, the largest
5701| positive integer is returned. Otherwise, if the conversion overflows, the
5702| largest integer with the same sign as `a' is returned.
5703*----------------------------------------------------------------------------*/
5704
f42c2224 5705int64_t float128_to_int64(float128 a, float_status *status)
158142c2
FB
5706{
5707 flag aSign;
f4014512 5708 int32_t aExp, shiftCount;
bb98fe42 5709 uint64_t aSig0, aSig1;
158142c2
FB
5710
5711 aSig1 = extractFloat128Frac1( a );
5712 aSig0 = extractFloat128Frac0( a );
5713 aExp = extractFloat128Exp( a );
5714 aSign = extractFloat128Sign( a );
5715 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5716 shiftCount = 0x402F - aExp;
5717 if ( shiftCount <= 0 ) {
5718 if ( 0x403E < aExp ) {
ff32e16e 5719 float_raise(float_flag_invalid, status);
158142c2
FB
5720 if ( ! aSign
5721 || ( ( aExp == 0x7FFF )
5722 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5723 )
5724 ) {
5725 return LIT64( 0x7FFFFFFFFFFFFFFF );
5726 }
bb98fe42 5727 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5728 }
5729 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5730 }
5731 else {
5732 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5733 }
ff32e16e 5734 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
5735
5736}
5737
5738/*----------------------------------------------------------------------------
5739| Returns the result of converting the quadruple-precision floating-point
5740| value `a' to the 64-bit two's complement integer format. The conversion
5741| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5742| Arithmetic, except that the conversion is always rounded toward zero.
5743| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5744| the conversion overflows, the largest integer with the same sign as `a' is
5745| returned.
5746*----------------------------------------------------------------------------*/
5747
f42c2224 5748int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
5749{
5750 flag aSign;
f4014512 5751 int32_t aExp, shiftCount;
bb98fe42 5752 uint64_t aSig0, aSig1;
f42c2224 5753 int64_t z;
158142c2
FB
5754
5755 aSig1 = extractFloat128Frac1( a );
5756 aSig0 = extractFloat128Frac0( a );
5757 aExp = extractFloat128Exp( a );
5758 aSign = extractFloat128Sign( a );
5759 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5760 shiftCount = aExp - 0x402F;
5761 if ( 0 < shiftCount ) {
5762 if ( 0x403E <= aExp ) {
5763 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5764 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5765 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
a2f2d288
PM
5766 if (aSig1) {
5767 status->float_exception_flags |= float_flag_inexact;
5768 }
158142c2
FB
5769 }
5770 else {
ff32e16e 5771 float_raise(float_flag_invalid, status);
158142c2
FB
5772 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5773 return LIT64( 0x7FFFFFFFFFFFFFFF );
5774 }
5775 }
bb98fe42 5776 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5777 }
5778 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5779 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
a2f2d288 5780 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5781 }
5782 }
5783 else {
5784 if ( aExp < 0x3FFF ) {
5785 if ( aExp | aSig0 | aSig1 ) {
a2f2d288 5786 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5787 }
5788 return 0;
5789 }
5790 z = aSig0>>( - shiftCount );
5791 if ( aSig1
bb98fe42 5792 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
a2f2d288 5793 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5794 }
5795 }
5796 if ( aSign ) z = - z;
5797 return z;
5798
5799}
5800
2e6d8568
BR
5801/*----------------------------------------------------------------------------
5802| Returns the result of converting the quadruple-precision floating-point value
5803| `a' to the 64-bit unsigned integer format. The conversion is
5804| performed according to the IEC/IEEE Standard for Binary Floating-Point
5805| Arithmetic---which means in particular that the conversion is rounded
5806| according to the current rounding mode. If `a' is a NaN, the largest
5807| positive integer is returned. If the conversion overflows, the
5808| largest unsigned integer is returned. If 'a' is negative, the value is
5809| rounded and zero is returned; negative values that do not round to zero
5810| will raise the inexact exception.
5811*----------------------------------------------------------------------------*/
5812
5813uint64_t float128_to_uint64(float128 a, float_status *status)
5814{
5815 flag aSign;
5816 int aExp;
5817 int shiftCount;
5818 uint64_t aSig0, aSig1;
5819
5820 aSig0 = extractFloat128Frac0(a);
5821 aSig1 = extractFloat128Frac1(a);
5822 aExp = extractFloat128Exp(a);
5823 aSign = extractFloat128Sign(a);
5824 if (aSign && (aExp > 0x3FFE)) {
5825 float_raise(float_flag_invalid, status);
5826 if (float128_is_any_nan(a)) {
5827 return LIT64(0xFFFFFFFFFFFFFFFF);
5828 } else {
5829 return 0;
5830 }
5831 }
5832 if (aExp) {
5833 aSig0 |= LIT64(0x0001000000000000);
5834 }
5835 shiftCount = 0x402F - aExp;
5836 if (shiftCount <= 0) {
5837 if (0x403E < aExp) {
5838 float_raise(float_flag_invalid, status);
5839 return LIT64(0xFFFFFFFFFFFFFFFF);
5840 }
5841 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5842 } else {
5843 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5844 }
5845 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5846}
5847
5848uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5849{
5850 uint64_t v;
5851 signed char current_rounding_mode = status->float_rounding_mode;
5852
5853 set_float_rounding_mode(float_round_to_zero, status);
5854 v = float128_to_uint64(a, status);
5855 set_float_rounding_mode(current_rounding_mode, status);
5856
5857 return v;
5858}
5859
158142c2
FB
5860/*----------------------------------------------------------------------------
5861| Returns the result of converting the quadruple-precision floating-point
fd425037
BR
5862| value `a' to the 32-bit unsigned integer format. The conversion
5863| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5864| Arithmetic except that the conversion is always rounded toward zero.
5865| If `a' is a NaN, the largest positive integer is returned. Otherwise,
5866| if the conversion overflows, the largest unsigned integer is returned.
5867| If 'a' is negative, the value is rounded and zero is returned; negative
5868| values that do not round to zero will raise the inexact exception.
5869*----------------------------------------------------------------------------*/
5870
5871uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5872{
5873 uint64_t v;
5874 uint32_t res;
5875 int old_exc_flags = get_float_exception_flags(status);
5876
5877 v = float128_to_uint64_round_to_zero(a, status);
5878 if (v > 0xffffffff) {
5879 res = 0xffffffff;
5880 } else {
5881 return v;
5882 }
5883 set_float_exception_flags(old_exc_flags, status);
5884 float_raise(float_flag_invalid, status);
5885 return res;
5886}
5887
5888/*----------------------------------------------------------------------------
5889| Returns the result of converting the quadruple-precision floating-point
158142c2
FB
5890| value `a' to the single-precision floating-point format. The conversion
5891| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5892| Arithmetic.
5893*----------------------------------------------------------------------------*/
5894
e5a41ffa 5895float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
5896{
5897 flag aSign;
f4014512 5898 int32_t aExp;
bb98fe42
AF
5899 uint64_t aSig0, aSig1;
5900 uint32_t zSig;
158142c2
FB
5901
5902 aSig1 = extractFloat128Frac1( a );
5903 aSig0 = extractFloat128Frac0( a );
5904 aExp = extractFloat128Exp( a );
5905 aSign = extractFloat128Sign( a );
5906 if ( aExp == 0x7FFF ) {
5907 if ( aSig0 | aSig1 ) {
ff32e16e 5908 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
5909 }
5910 return packFloat32( aSign, 0xFF, 0 );
5911 }
5912 aSig0 |= ( aSig1 != 0 );
5913 shift64RightJamming( aSig0, 18, &aSig0 );
5914 zSig = aSig0;
5915 if ( aExp || zSig ) {
5916 zSig |= 0x40000000;
5917 aExp -= 0x3F81;
5918 }
ff32e16e 5919 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
5920
5921}
5922
5923/*----------------------------------------------------------------------------
5924| Returns the result of converting the quadruple-precision floating-point
5925| value `a' to the double-precision floating-point format. The conversion
5926| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5927| Arithmetic.
5928*----------------------------------------------------------------------------*/
5929
e5a41ffa 5930float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
5931{
5932 flag aSign;
f4014512 5933 int32_t aExp;
bb98fe42 5934 uint64_t aSig0, aSig1;
158142c2
FB
5935
5936 aSig1 = extractFloat128Frac1( a );
5937 aSig0 = extractFloat128Frac0( a );
5938 aExp = extractFloat128Exp( a );
5939 aSign = extractFloat128Sign( a );
5940 if ( aExp == 0x7FFF ) {
5941 if ( aSig0 | aSig1 ) {
ff32e16e 5942 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
5943 }
5944 return packFloat64( aSign, 0x7FF, 0 );
5945 }
5946 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5947 aSig0 |= ( aSig1 != 0 );
5948 if ( aExp || aSig0 ) {
5949 aSig0 |= LIT64( 0x4000000000000000 );
5950 aExp -= 0x3C01;
5951 }
ff32e16e 5952 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
5953
5954}
5955
158142c2
FB
5956/*----------------------------------------------------------------------------
5957| Returns the result of converting the quadruple-precision floating-point
5958| value `a' to the extended double-precision floating-point format. The
5959| conversion is performed according to the IEC/IEEE Standard for Binary
5960| Floating-Point Arithmetic.
5961*----------------------------------------------------------------------------*/
5962
e5a41ffa 5963floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
5964{
5965 flag aSign;
f4014512 5966 int32_t aExp;
bb98fe42 5967 uint64_t aSig0, aSig1;
158142c2
FB
5968
5969 aSig1 = extractFloat128Frac1( a );
5970 aSig0 = extractFloat128Frac0( a );
5971 aExp = extractFloat128Exp( a );
5972 aSign = extractFloat128Sign( a );
5973 if ( aExp == 0x7FFF ) {
5974 if ( aSig0 | aSig1 ) {
ff32e16e 5975 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2 5976 }
0f605c88
LV
5977 return packFloatx80(aSign, floatx80_infinity_high,
5978 floatx80_infinity_low);
158142c2
FB
5979 }
5980 if ( aExp == 0 ) {
5981 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5982 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5983 }
5984 else {
5985 aSig0 |= LIT64( 0x0001000000000000 );
5986 }
5987 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 5988 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
5989
5990}
5991
158142c2
FB
5992/*----------------------------------------------------------------------------
5993| Rounds the quadruple-precision floating-point value `a' to an integer, and
5994| returns the result as a quadruple-precision floating-point value. The
5995| operation is performed according to the IEC/IEEE Standard for Binary
5996| Floating-Point Arithmetic.
5997*----------------------------------------------------------------------------*/
5998
e5a41ffa 5999float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
6000{
6001 flag aSign;
f4014512 6002 int32_t aExp;
bb98fe42 6003 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
6004 float128 z;
6005
6006 aExp = extractFloat128Exp( a );
6007 if ( 0x402F <= aExp ) {
6008 if ( 0x406F <= aExp ) {
6009 if ( ( aExp == 0x7FFF )
6010 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6011 ) {
ff32e16e 6012 return propagateFloat128NaN(a, a, status);
158142c2
FB
6013 }
6014 return a;
6015 }
6016 lastBitMask = 1;
6017 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6018 roundBitsMask = lastBitMask - 1;
6019 z = a;
a2f2d288 6020 switch (status->float_rounding_mode) {
dc355b76 6021 case float_round_nearest_even:
158142c2
FB
6022 if ( lastBitMask ) {
6023 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6024 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6025 }
6026 else {
bb98fe42 6027 if ( (int64_t) z.low < 0 ) {
158142c2 6028 ++z.high;
bb98fe42 6029 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6030 }
6031 }
dc355b76 6032 break;
f9288a76
PM
6033 case float_round_ties_away:
6034 if (lastBitMask) {
6035 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6036 } else {
6037 if ((int64_t) z.low < 0) {
6038 ++z.high;
6039 }
6040 }
6041 break;
dc355b76
PM
6042 case float_round_to_zero:
6043 break;
6044 case float_round_up:
6045 if (!extractFloat128Sign(z)) {
6046 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6047 }
6048 break;
6049 case float_round_down:
6050 if (extractFloat128Sign(z)) {
6051 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6052 }
dc355b76
PM
6053 break;
6054 default:
6055 abort();
158142c2
FB
6056 }
6057 z.low &= ~ roundBitsMask;
6058 }
6059 else {
6060 if ( aExp < 0x3FFF ) {
bb98fe42 6061 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
a2f2d288 6062 status->float_exception_flags |= float_flag_inexact;
158142c2 6063 aSign = extractFloat128Sign( a );
a2f2d288 6064 switch (status->float_rounding_mode) {
158142c2
FB
6065 case float_round_nearest_even:
6066 if ( ( aExp == 0x3FFE )
6067 && ( extractFloat128Frac0( a )
6068 | extractFloat128Frac1( a ) )
6069 ) {
6070 return packFloat128( aSign, 0x3FFF, 0, 0 );
6071 }
6072 break;
f9288a76
PM
6073 case float_round_ties_away:
6074 if (aExp == 0x3FFE) {
6075 return packFloat128(aSign, 0x3FFF, 0, 0);
6076 }
6077 break;
158142c2
FB
6078 case float_round_down:
6079 return
6080 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6081 : packFloat128( 0, 0, 0, 0 );
6082 case float_round_up:
6083 return
6084 aSign ? packFloat128( 1, 0, 0, 0 )
6085 : packFloat128( 0, 0x3FFF, 0, 0 );
6086 }
6087 return packFloat128( aSign, 0, 0, 0 );
6088 }
6089 lastBitMask = 1;
6090 lastBitMask <<= 0x402F - aExp;
6091 roundBitsMask = lastBitMask - 1;
6092 z.low = 0;
6093 z.high = a.high;
a2f2d288 6094 switch (status->float_rounding_mode) {
dc355b76 6095 case float_round_nearest_even:
158142c2
FB
6096 z.high += lastBitMask>>1;
6097 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6098 z.high &= ~ lastBitMask;
6099 }
dc355b76 6100 break;
f9288a76
PM
6101 case float_round_ties_away:
6102 z.high += lastBitMask>>1;
6103 break;
dc355b76
PM
6104 case float_round_to_zero:
6105 break;
6106 case float_round_up:
6107 if (!extractFloat128Sign(z)) {
158142c2
FB
6108 z.high |= ( a.low != 0 );
6109 z.high += roundBitsMask;
6110 }
dc355b76
PM
6111 break;
6112 case float_round_down:
6113 if (extractFloat128Sign(z)) {
6114 z.high |= (a.low != 0);
6115 z.high += roundBitsMask;
6116 }
6117 break;
6118 default:
6119 abort();
158142c2
FB
6120 }
6121 z.high &= ~ roundBitsMask;
6122 }
6123 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
a2f2d288 6124 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6125 }
6126 return z;
6127
6128}
6129
6130/*----------------------------------------------------------------------------
6131| Returns the result of adding the absolute values of the quadruple-precision
6132| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6133| before being returned. `zSign' is ignored if the result is a NaN.
6134| The addition is performed according to the IEC/IEEE Standard for Binary
6135| Floating-Point Arithmetic.
6136*----------------------------------------------------------------------------*/
6137
e5a41ffa
PM
6138static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6139 float_status *status)
158142c2 6140{
f4014512 6141 int32_t aExp, bExp, zExp;
bb98fe42 6142 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
f4014512 6143 int32_t expDiff;
158142c2
FB
6144
6145 aSig1 = extractFloat128Frac1( a );
6146 aSig0 = extractFloat128Frac0( a );
6147 aExp = extractFloat128Exp( a );
6148 bSig1 = extractFloat128Frac1( b );
6149 bSig0 = extractFloat128Frac0( b );
6150 bExp = extractFloat128Exp( b );
6151 expDiff = aExp - bExp;
6152 if ( 0 < expDiff ) {
6153 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6154 if (aSig0 | aSig1) {
6155 return propagateFloat128NaN(a, b, status);
6156 }
158142c2
FB
6157 return a;
6158 }
6159 if ( bExp == 0 ) {
6160 --expDiff;
6161 }
6162 else {
6163 bSig0 |= LIT64( 0x0001000000000000 );
6164 }
6165 shift128ExtraRightJamming(
6166 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6167 zExp = aExp;
6168 }
6169 else if ( expDiff < 0 ) {
6170 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6171 if (bSig0 | bSig1) {
6172 return propagateFloat128NaN(a, b, status);
6173 }
158142c2
FB
6174 return packFloat128( zSign, 0x7FFF, 0, 0 );
6175 }
6176 if ( aExp == 0 ) {
6177 ++expDiff;
6178 }
6179 else {
6180 aSig0 |= LIT64( 0x0001000000000000 );
6181 }
6182 shift128ExtraRightJamming(
6183 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6184 zExp = bExp;
6185 }
6186 else {
6187 if ( aExp == 0x7FFF ) {
6188 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6189 return propagateFloat128NaN(a, b, status);
158142c2
FB
6190 }
6191 return a;
6192 }
6193 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6194 if ( aExp == 0 ) {
a2f2d288 6195 if (status->flush_to_zero) {
e6afc87f 6196 if (zSig0 | zSig1) {
ff32e16e 6197 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6198 }
6199 return packFloat128(zSign, 0, 0, 0);
6200 }
fe76d976
PB
6201 return packFloat128( zSign, 0, zSig0, zSig1 );
6202 }
158142c2
FB
6203 zSig2 = 0;
6204 zSig0 |= LIT64( 0x0002000000000000 );
6205 zExp = aExp;
6206 goto shiftRight1;
6207 }
6208 aSig0 |= LIT64( 0x0001000000000000 );
6209 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6210 --zExp;
6211 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6212 ++zExp;
6213 shiftRight1:
6214 shift128ExtraRightJamming(
6215 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6216 roundAndPack:
ff32e16e 6217 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6218
6219}
6220
6221/*----------------------------------------------------------------------------
6222| Returns the result of subtracting the absolute values of the quadruple-
6223| precision floating-point values `a' and `b'. If `zSign' is 1, the
6224| difference is negated before being returned. `zSign' is ignored if the
6225| result is a NaN. The subtraction is performed according to the IEC/IEEE
6226| Standard for Binary Floating-Point Arithmetic.
6227*----------------------------------------------------------------------------*/
6228
e5a41ffa
PM
6229static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6230 float_status *status)
158142c2 6231{
f4014512 6232 int32_t aExp, bExp, zExp;
bb98fe42 6233 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
f4014512 6234 int32_t expDiff;
158142c2
FB
6235
6236 aSig1 = extractFloat128Frac1( a );
6237 aSig0 = extractFloat128Frac0( a );
6238 aExp = extractFloat128Exp( a );
6239 bSig1 = extractFloat128Frac1( b );
6240 bSig0 = extractFloat128Frac0( b );
6241 bExp = extractFloat128Exp( b );
6242 expDiff = aExp - bExp;
6243 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6244 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6245 if ( 0 < expDiff ) goto aExpBigger;
6246 if ( expDiff < 0 ) goto bExpBigger;
6247 if ( aExp == 0x7FFF ) {
6248 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6249 return propagateFloat128NaN(a, b, status);
158142c2 6250 }
ff32e16e 6251 float_raise(float_flag_invalid, status);
af39bc8c 6252 return float128_default_nan(status);
158142c2
FB
6253 }
6254 if ( aExp == 0 ) {
6255 aExp = 1;
6256 bExp = 1;
6257 }
6258 if ( bSig0 < aSig0 ) goto aBigger;
6259 if ( aSig0 < bSig0 ) goto bBigger;
6260 if ( bSig1 < aSig1 ) goto aBigger;
6261 if ( aSig1 < bSig1 ) goto bBigger;
a2f2d288
PM
6262 return packFloat128(status->float_rounding_mode == float_round_down,
6263 0, 0, 0);
158142c2
FB
6264 bExpBigger:
6265 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6266 if (bSig0 | bSig1) {
6267 return propagateFloat128NaN(a, b, status);
6268 }
158142c2
FB
6269 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6270 }
6271 if ( aExp == 0 ) {
6272 ++expDiff;
6273 }
6274 else {
6275 aSig0 |= LIT64( 0x4000000000000000 );
6276 }
6277 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6278 bSig0 |= LIT64( 0x4000000000000000 );
6279 bBigger:
6280 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6281 zExp = bExp;
6282 zSign ^= 1;
6283 goto normalizeRoundAndPack;
6284 aExpBigger:
6285 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6286 if (aSig0 | aSig1) {
6287 return propagateFloat128NaN(a, b, status);
6288 }
158142c2
FB
6289 return a;
6290 }
6291 if ( bExp == 0 ) {
6292 --expDiff;
6293 }
6294 else {
6295 bSig0 |= LIT64( 0x4000000000000000 );
6296 }
6297 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6298 aSig0 |= LIT64( 0x4000000000000000 );
6299 aBigger:
6300 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6301 zExp = aExp;
6302 normalizeRoundAndPack:
6303 --zExp;
ff32e16e
PM
6304 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6305 status);
158142c2
FB
6306
6307}
6308
6309/*----------------------------------------------------------------------------
6310| Returns the result of adding the quadruple-precision floating-point values
6311| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6312| for Binary Floating-Point Arithmetic.
6313*----------------------------------------------------------------------------*/
6314
e5a41ffa 6315float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6316{
6317 flag aSign, bSign;
6318
6319 aSign = extractFloat128Sign( a );
6320 bSign = extractFloat128Sign( b );
6321 if ( aSign == bSign ) {
ff32e16e 6322 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6323 }
6324 else {
ff32e16e 6325 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6326 }
6327
6328}
6329
6330/*----------------------------------------------------------------------------
6331| Returns the result of subtracting the quadruple-precision floating-point
6332| values `a' and `b'. The operation is performed according to the IEC/IEEE
6333| Standard for Binary Floating-Point Arithmetic.
6334*----------------------------------------------------------------------------*/
6335
e5a41ffa 6336float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6337{
6338 flag aSign, bSign;
6339
6340 aSign = extractFloat128Sign( a );
6341 bSign = extractFloat128Sign( b );
6342 if ( aSign == bSign ) {
ff32e16e 6343 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6344 }
6345 else {
ff32e16e 6346 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6347 }
6348
6349}
6350
6351/*----------------------------------------------------------------------------
6352| Returns the result of multiplying the quadruple-precision floating-point
6353| values `a' and `b'. The operation is performed according to the IEC/IEEE
6354| Standard for Binary Floating-Point Arithmetic.
6355*----------------------------------------------------------------------------*/
6356
e5a41ffa 6357float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6358{
6359 flag aSign, bSign, zSign;
f4014512 6360 int32_t aExp, bExp, zExp;
bb98fe42 6361 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6362
6363 aSig1 = extractFloat128Frac1( a );
6364 aSig0 = extractFloat128Frac0( a );
6365 aExp = extractFloat128Exp( a );
6366 aSign = extractFloat128Sign( a );
6367 bSig1 = extractFloat128Frac1( b );
6368 bSig0 = extractFloat128Frac0( b );
6369 bExp = extractFloat128Exp( b );
6370 bSign = extractFloat128Sign( b );
6371 zSign = aSign ^ bSign;
6372 if ( aExp == 0x7FFF ) {
6373 if ( ( aSig0 | aSig1 )
6374 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6375 return propagateFloat128NaN(a, b, status);
158142c2
FB
6376 }
6377 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6378 return packFloat128( zSign, 0x7FFF, 0, 0 );
6379 }
6380 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6381 if (bSig0 | bSig1) {
6382 return propagateFloat128NaN(a, b, status);
6383 }
158142c2
FB
6384 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6385 invalid:
ff32e16e 6386 float_raise(float_flag_invalid, status);
af39bc8c 6387 return float128_default_nan(status);
158142c2
FB
6388 }
6389 return packFloat128( zSign, 0x7FFF, 0, 0 );
6390 }
6391 if ( aExp == 0 ) {
6392 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6393 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6394 }
6395 if ( bExp == 0 ) {
6396 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6397 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6398 }
6399 zExp = aExp + bExp - 0x4000;
6400 aSig0 |= LIT64( 0x0001000000000000 );
6401 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6402 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6403 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6404 zSig2 |= ( zSig3 != 0 );
6405 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6406 shift128ExtraRightJamming(
6407 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6408 ++zExp;
6409 }
ff32e16e 6410 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6411
6412}
6413
6414/*----------------------------------------------------------------------------
6415| Returns the result of dividing the quadruple-precision floating-point value
6416| `a' by the corresponding value `b'. The operation is performed according to
6417| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6418*----------------------------------------------------------------------------*/
6419
e5a41ffa 6420float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6421{
6422 flag aSign, bSign, zSign;
f4014512 6423 int32_t aExp, bExp, zExp;
bb98fe42
AF
6424 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6425 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6426
6427 aSig1 = extractFloat128Frac1( a );
6428 aSig0 = extractFloat128Frac0( a );
6429 aExp = extractFloat128Exp( a );
6430 aSign = extractFloat128Sign( a );
6431 bSig1 = extractFloat128Frac1( b );
6432 bSig0 = extractFloat128Frac0( b );
6433 bExp = extractFloat128Exp( b );
6434 bSign = extractFloat128Sign( b );
6435 zSign = aSign ^ bSign;
6436 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6437 if (aSig0 | aSig1) {
6438 return propagateFloat128NaN(a, b, status);
6439 }
158142c2 6440 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6441 if (bSig0 | bSig1) {
6442 return propagateFloat128NaN(a, b, status);
6443 }
158142c2
FB
6444 goto invalid;
6445 }
6446 return packFloat128( zSign, 0x7FFF, 0, 0 );
6447 }
6448 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6449 if (bSig0 | bSig1) {
6450 return propagateFloat128NaN(a, b, status);
6451 }
158142c2
FB
6452 return packFloat128( zSign, 0, 0, 0 );
6453 }
6454 if ( bExp == 0 ) {
6455 if ( ( bSig0 | bSig1 ) == 0 ) {
6456 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6457 invalid:
ff32e16e 6458 float_raise(float_flag_invalid, status);
af39bc8c 6459 return float128_default_nan(status);
158142c2 6460 }
ff32e16e 6461 float_raise(float_flag_divbyzero, status);
158142c2
FB
6462 return packFloat128( zSign, 0x7FFF, 0, 0 );
6463 }
6464 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6465 }
6466 if ( aExp == 0 ) {
6467 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6468 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6469 }
6470 zExp = aExp - bExp + 0x3FFD;
6471 shortShift128Left(
6472 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6473 shortShift128Left(
6474 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6475 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6476 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6477 ++zExp;
6478 }
6479 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6480 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6481 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6482 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6483 --zSig0;
6484 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6485 }
6486 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6487 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6488 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6489 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6490 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6491 --zSig1;
6492 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6493 }
6494 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6495 }
6496 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6497 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6498
6499}
6500
6501/*----------------------------------------------------------------------------
6502| Returns the remainder of the quadruple-precision floating-point value `a'
6503| with respect to the corresponding value `b'. The operation is performed
6504| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6505*----------------------------------------------------------------------------*/
6506
e5a41ffa 6507float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6508{
ed086f3d 6509 flag aSign, zSign;
f4014512 6510 int32_t aExp, bExp, expDiff;
bb98fe42
AF
6511 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6512 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6513 int64_t sigMean0;
158142c2
FB
6514
6515 aSig1 = extractFloat128Frac1( a );
6516 aSig0 = extractFloat128Frac0( a );
6517 aExp = extractFloat128Exp( a );
6518 aSign = extractFloat128Sign( a );
6519 bSig1 = extractFloat128Frac1( b );
6520 bSig0 = extractFloat128Frac0( b );
6521 bExp = extractFloat128Exp( b );
158142c2
FB
6522 if ( aExp == 0x7FFF ) {
6523 if ( ( aSig0 | aSig1 )
6524 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6525 return propagateFloat128NaN(a, b, status);
158142c2
FB
6526 }
6527 goto invalid;
6528 }
6529 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6530 if (bSig0 | bSig1) {
6531 return propagateFloat128NaN(a, b, status);
6532 }
158142c2
FB
6533 return a;
6534 }
6535 if ( bExp == 0 ) {
6536 if ( ( bSig0 | bSig1 ) == 0 ) {
6537 invalid:
ff32e16e 6538 float_raise(float_flag_invalid, status);
af39bc8c 6539 return float128_default_nan(status);
158142c2
FB
6540 }
6541 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6542 }
6543 if ( aExp == 0 ) {
6544 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6545 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6546 }
6547 expDiff = aExp - bExp;
6548 if ( expDiff < -1 ) return a;
6549 shortShift128Left(
6550 aSig0 | LIT64( 0x0001000000000000 ),
6551 aSig1,
6552 15 - ( expDiff < 0 ),
6553 &aSig0,
6554 &aSig1
6555 );
6556 shortShift128Left(
6557 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6558 q = le128( bSig0, bSig1, aSig0, aSig1 );
6559 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6560 expDiff -= 64;
6561 while ( 0 < expDiff ) {
6562 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6563 q = ( 4 < q ) ? q - 4 : 0;
6564 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6565 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6566 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6567 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6568 expDiff -= 61;
6569 }
6570 if ( -64 < expDiff ) {
6571 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6572 q = ( 4 < q ) ? q - 4 : 0;
6573 q >>= - expDiff;
6574 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6575 expDiff += 52;
6576 if ( expDiff < 0 ) {
6577 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6578 }
6579 else {
6580 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6581 }
6582 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6583 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6584 }
6585 else {
6586 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6587 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6588 }
6589 do {
6590 alternateASig0 = aSig0;
6591 alternateASig1 = aSig1;
6592 ++q;
6593 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6594 } while ( 0 <= (int64_t) aSig0 );
158142c2 6595 add128(
bb98fe42 6596 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6597 if ( ( sigMean0 < 0 )
6598 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6599 aSig0 = alternateASig0;
6600 aSig1 = alternateASig1;
6601 }
bb98fe42 6602 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6603 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6604 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6605 status);
158142c2
FB
6606}
6607
6608/*----------------------------------------------------------------------------
6609| Returns the square root of the quadruple-precision floating-point value `a'.
6610| The operation is performed according to the IEC/IEEE Standard for Binary
6611| Floating-Point Arithmetic.
6612*----------------------------------------------------------------------------*/
6613
e5a41ffa 6614float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6615{
6616 flag aSign;
f4014512 6617 int32_t aExp, zExp;
bb98fe42
AF
6618 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6619 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6620
6621 aSig1 = extractFloat128Frac1( a );
6622 aSig0 = extractFloat128Frac0( a );
6623 aExp = extractFloat128Exp( a );
6624 aSign = extractFloat128Sign( a );
6625 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6626 if (aSig0 | aSig1) {
6627 return propagateFloat128NaN(a, a, status);
6628 }
158142c2
FB
6629 if ( ! aSign ) return a;
6630 goto invalid;
6631 }
6632 if ( aSign ) {
6633 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6634 invalid:
ff32e16e 6635 float_raise(float_flag_invalid, status);
af39bc8c 6636 return float128_default_nan(status);
158142c2
FB
6637 }
6638 if ( aExp == 0 ) {
6639 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6640 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6641 }
6642 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6643 aSig0 |= LIT64( 0x0001000000000000 );
6644 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6645 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6646 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6647 doubleZSig0 = zSig0<<1;
6648 mul64To128( zSig0, zSig0, &term0, &term1 );
6649 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6650 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6651 --zSig0;
6652 doubleZSig0 -= 2;
6653 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6654 }
6655 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6656 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6657 if ( zSig1 == 0 ) zSig1 = 1;
6658 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6659 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6660 mul64To128( zSig1, zSig1, &term2, &term3 );
6661 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6662 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6663 --zSig1;
6664 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6665 term3 |= 1;
6666 term2 |= doubleZSig0;
6667 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6668 }
6669 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6670 }
6671 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 6672 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6673
6674}
6675
6676/*----------------------------------------------------------------------------
6677| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6678| the corresponding value `b', and 0 otherwise. The invalid exception is
6679| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6680| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6681*----------------------------------------------------------------------------*/
6682
e5a41ffa 6683int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
6684{
6685
6686 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6687 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6688 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6689 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6690 ) {
ff32e16e 6691 float_raise(float_flag_invalid, status);
158142c2
FB
6692 return 0;
6693 }
6694 return
6695 ( a.low == b.low )
6696 && ( ( a.high == b.high )
6697 || ( ( a.low == 0 )
bb98fe42 6698 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6699 );
6700
6701}
6702
6703/*----------------------------------------------------------------------------
6704| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6705| or equal to the corresponding value `b', and 0 otherwise. The invalid
6706| exception is raised if either operand is a NaN. The comparison is performed
6707| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6708*----------------------------------------------------------------------------*/
6709
e5a41ffa 6710int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
6711{
6712 flag aSign, bSign;
6713
6714 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6715 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6716 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6717 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6718 ) {
ff32e16e 6719 float_raise(float_flag_invalid, status);
158142c2
FB
6720 return 0;
6721 }
6722 aSign = extractFloat128Sign( a );
6723 bSign = extractFloat128Sign( b );
6724 if ( aSign != bSign ) {
6725 return
6726 aSign
bb98fe42 6727 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6728 == 0 );
6729 }
6730 return
6731 aSign ? le128( b.high, b.low, a.high, a.low )
6732 : le128( a.high, a.low, b.high, b.low );
6733
6734}
6735
6736/*----------------------------------------------------------------------------
6737| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6738| the corresponding value `b', and 0 otherwise. The invalid exception is
6739| raised if either operand is a NaN. The comparison is performed according
6740| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6741*----------------------------------------------------------------------------*/
6742
e5a41ffa 6743int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
6744{
6745 flag aSign, bSign;
6746
6747 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6748 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6749 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6750 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6751 ) {
ff32e16e 6752 float_raise(float_flag_invalid, status);
158142c2
FB
6753 return 0;
6754 }
6755 aSign = extractFloat128Sign( a );
6756 bSign = extractFloat128Sign( b );
6757 if ( aSign != bSign ) {
6758 return
6759 aSign
bb98fe42 6760 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6761 != 0 );
6762 }
6763 return
6764 aSign ? lt128( b.high, b.low, a.high, a.low )
6765 : lt128( a.high, a.low, b.high, b.low );
6766
6767}
6768
67b7861d
AJ
6769/*----------------------------------------------------------------------------
6770| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6771| be compared, and 0 otherwise. The invalid exception is raised if either
6772| operand is a NaN. The comparison is performed according to the IEC/IEEE
6773| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6774*----------------------------------------------------------------------------*/
6775
e5a41ffa 6776int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
6777{
6778 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6779 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6780 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6781 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6782 ) {
ff32e16e 6783 float_raise(float_flag_invalid, status);
67b7861d
AJ
6784 return 1;
6785 }
6786 return 0;
6787}
6788
158142c2
FB
6789/*----------------------------------------------------------------------------
6790| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6791| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6792| exception. The comparison is performed according to the IEC/IEEE Standard
6793| for Binary Floating-Point Arithmetic.
158142c2
FB
6794*----------------------------------------------------------------------------*/
6795
e5a41ffa 6796int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6797{
6798
6799 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6800 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6801 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6802 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6803 ) {
af39bc8c
AM
6804 if (float128_is_signaling_nan(a, status)
6805 || float128_is_signaling_nan(b, status)) {
ff32e16e 6806 float_raise(float_flag_invalid, status);
b689362d 6807 }
158142c2
FB
6808 return 0;
6809 }
6810 return
6811 ( a.low == b.low )
6812 && ( ( a.high == b.high )
6813 || ( ( a.low == 0 )
bb98fe42 6814 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6815 );
6816
6817}
6818
6819/*----------------------------------------------------------------------------
6820| Returns 1 if the quadruple-precision floating-point value `a' is less than
6821| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6822| cause an exception. Otherwise, the comparison is performed according to the
6823| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6824*----------------------------------------------------------------------------*/
6825
e5a41ffa 6826int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6827{
6828 flag aSign, bSign;
6829
6830 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6831 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6832 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6833 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6834 ) {
af39bc8c
AM
6835 if (float128_is_signaling_nan(a, status)
6836 || float128_is_signaling_nan(b, status)) {
ff32e16e 6837 float_raise(float_flag_invalid, status);
158142c2
FB
6838 }
6839 return 0;
6840 }
6841 aSign = extractFloat128Sign( a );
6842 bSign = extractFloat128Sign( b );
6843 if ( aSign != bSign ) {
6844 return
6845 aSign
bb98fe42 6846 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6847 == 0 );
6848 }
6849 return
6850 aSign ? le128( b.high, b.low, a.high, a.low )
6851 : le128( a.high, a.low, b.high, b.low );
6852
6853}
6854
6855/*----------------------------------------------------------------------------
6856| Returns 1 if the quadruple-precision floating-point value `a' is less than
6857| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6858| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6859| Standard for Binary Floating-Point Arithmetic.
6860*----------------------------------------------------------------------------*/
6861
e5a41ffa 6862int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6863{
6864 flag aSign, bSign;
6865
6866 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6867 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6868 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6869 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6870 ) {
af39bc8c
AM
6871 if (float128_is_signaling_nan(a, status)
6872 || float128_is_signaling_nan(b, status)) {
ff32e16e 6873 float_raise(float_flag_invalid, status);
158142c2
FB
6874 }
6875 return 0;
6876 }
6877 aSign = extractFloat128Sign( a );
6878 bSign = extractFloat128Sign( b );
6879 if ( aSign != bSign ) {
6880 return
6881 aSign
bb98fe42 6882 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6883 != 0 );
6884 }
6885 return
6886 aSign ? lt128( b.high, b.low, a.high, a.low )
6887 : lt128( a.high, a.low, b.high, b.low );
6888
6889}
6890
67b7861d
AJ
6891/*----------------------------------------------------------------------------
6892| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6893| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6894| comparison is performed according to the IEC/IEEE Standard for Binary
6895| Floating-Point Arithmetic.
6896*----------------------------------------------------------------------------*/
6897
e5a41ffa 6898int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
6899{
6900 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6901 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6902 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6903 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6904 ) {
af39bc8c
AM
6905 if (float128_is_signaling_nan(a, status)
6906 || float128_is_signaling_nan(b, status)) {
ff32e16e 6907 float_raise(float_flag_invalid, status);
67b7861d
AJ
6908 }
6909 return 1;
6910 }
6911 return 0;
6912}
6913
e5a41ffa
PM
6914static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6915 int is_quiet, float_status *status)
f6714d36
AJ
6916{
6917 flag aSign, bSign;
6918
d1eb8f2a
AD
6919 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6920 float_raise(float_flag_invalid, status);
6921 return float_relation_unordered;
6922 }
f6714d36
AJ
6923 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6924 ( extractFloatx80Frac( a )<<1 ) ) ||
6925 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6926 ( extractFloatx80Frac( b )<<1 ) )) {
6927 if (!is_quiet ||
af39bc8c
AM
6928 floatx80_is_signaling_nan(a, status) ||
6929 floatx80_is_signaling_nan(b, status)) {
ff32e16e 6930 float_raise(float_flag_invalid, status);
f6714d36
AJ
6931 }
6932 return float_relation_unordered;
6933 }
6934 aSign = extractFloatx80Sign( a );
6935 bSign = extractFloatx80Sign( b );
6936 if ( aSign != bSign ) {
6937
6938 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6939 ( ( a.low | b.low ) == 0 ) ) {
6940 /* zero case */
6941 return float_relation_equal;
6942 } else {
6943 return 1 - (2 * aSign);
6944 }
6945 } else {
6946 if (a.low == b.low && a.high == b.high) {
6947 return float_relation_equal;
6948 } else {
6949 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6950 }
6951 }
6952}
6953
e5a41ffa 6954int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 6955{
ff32e16e 6956 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
6957}
6958
e5a41ffa 6959int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 6960{
ff32e16e 6961 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
6962}
6963
e5a41ffa
PM
6964static inline int float128_compare_internal(float128 a, float128 b,
6965 int is_quiet, float_status *status)
1f587329
BS
6966{
6967 flag aSign, bSign;
6968
6969 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6970 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6971 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6972 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6973 if (!is_quiet ||
af39bc8c
AM
6974 float128_is_signaling_nan(a, status) ||
6975 float128_is_signaling_nan(b, status)) {
ff32e16e 6976 float_raise(float_flag_invalid, status);
1f587329
BS
6977 }
6978 return float_relation_unordered;
6979 }
6980 aSign = extractFloat128Sign( a );
6981 bSign = extractFloat128Sign( b );
6982 if ( aSign != bSign ) {
6983 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6984 /* zero case */
6985 return float_relation_equal;
6986 } else {
6987 return 1 - (2 * aSign);
6988 }
6989 } else {
6990 if (a.low == b.low && a.high == b.high) {
6991 return float_relation_equal;
6992 } else {
6993 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6994 }
6995 }
6996}
6997
e5a41ffa 6998int float128_compare(float128 a, float128 b, float_status *status)
1f587329 6999{
ff32e16e 7000 return float128_compare_internal(a, b, 0, status);
1f587329
BS
7001}
7002
e5a41ffa 7003int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 7004{
ff32e16e 7005 return float128_compare_internal(a, b, 1, status);
1f587329
BS
7006}
7007
e5a41ffa 7008floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
7009{
7010 flag aSign;
326b9e98 7011 int32_t aExp;
bb98fe42 7012 uint64_t aSig;
9ee6e8bb 7013
d1eb8f2a
AD
7014 if (floatx80_invalid_encoding(a)) {
7015 float_raise(float_flag_invalid, status);
7016 return floatx80_default_nan(status);
7017 }
9ee6e8bb
PB
7018 aSig = extractFloatx80Frac( a );
7019 aExp = extractFloatx80Exp( a );
7020 aSign = extractFloatx80Sign( a );
7021
326b9e98
AJ
7022 if ( aExp == 0x7FFF ) {
7023 if ( aSig<<1 ) {
ff32e16e 7024 return propagateFloatx80NaN(a, a, status);
326b9e98 7025 }
9ee6e8bb
PB
7026 return a;
7027 }
326b9e98 7028
3c85c37f
PM
7029 if (aExp == 0) {
7030 if (aSig == 0) {
7031 return a;
7032 }
7033 aExp++;
7034 }
69397542 7035
326b9e98
AJ
7036 if (n > 0x10000) {
7037 n = 0x10000;
7038 } else if (n < -0x10000) {
7039 n = -0x10000;
7040 }
7041
9ee6e8bb 7042 aExp += n;
a2f2d288
PM
7043 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7044 aSign, aExp, aSig, 0, status);
9ee6e8bb 7045}
9ee6e8bb 7046
e5a41ffa 7047float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7048{
7049 flag aSign;
326b9e98 7050 int32_t aExp;
bb98fe42 7051 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7052
7053 aSig1 = extractFloat128Frac1( a );
7054 aSig0 = extractFloat128Frac0( a );
7055 aExp = extractFloat128Exp( a );
7056 aSign = extractFloat128Sign( a );
7057 if ( aExp == 0x7FFF ) {
326b9e98 7058 if ( aSig0 | aSig1 ) {
ff32e16e 7059 return propagateFloat128NaN(a, a, status);
326b9e98 7060 }
9ee6e8bb
PB
7061 return a;
7062 }
3c85c37f 7063 if (aExp != 0) {
69397542 7064 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7065 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7066 return a;
3c85c37f
PM
7067 } else {
7068 aExp++;
7069 }
69397542 7070
326b9e98
AJ
7071 if (n > 0x10000) {
7072 n = 0x10000;
7073 } else if (n < -0x10000) {
7074 n = -0x10000;
7075 }
7076
69397542
PB
7077 aExp += n - 1;
7078 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7079 , status);
9ee6e8bb
PB
7080
7081}
This page took 1.686246 seconds and 4 git commands to generate.