]> Git Repo - qemu.git/blame - fpu/softfloat.c
fpu/softfloat: Partial support for ARM Alternative half-precision
[qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
/* softfloat (and in particular the code in softfloat-specialize.h) is
 * target-dependent and needs the TARGET_* macros.
 */
d38ea87a 85#include "qemu/osdep.h"
6fff2167 86#include "qemu/bitops.h"
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76 89/* We only need stdlib for abort() */
dc355b76 90
158142c2
FB
/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations.  (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
88857aca 96#include "fpu/softfloat-macros.h"
158142c2 97
bb4d4bb3
PM
98/*----------------------------------------------------------------------------
99| Returns the fraction bits of the half-precision floating-point value `a'.
100*----------------------------------------------------------------------------*/
101
a49db98d 102static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
103{
104 return float16_val(a) & 0x3ff;
105}
106
107/*----------------------------------------------------------------------------
108| Returns the exponent bits of the half-precision floating-point value `a'.
109*----------------------------------------------------------------------------*/
110
0c48262d 111static inline int extractFloat16Exp(float16 a)
bb4d4bb3
PM
112{
113 return (float16_val(a) >> 10) & 0x1f;
114}
115
116/*----------------------------------------------------------------------------
117| Returns the sign bit of the single-precision floating-point value `a'.
118*----------------------------------------------------------------------------*/
119
a49db98d 120static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
121{
122 return float16_val(a)>>15;
123}
124
d97544c9
AB
125/*----------------------------------------------------------------------------
126| Returns the fraction bits of the single-precision floating-point value `a'.
127*----------------------------------------------------------------------------*/
128
129static inline uint32_t extractFloat32Frac(float32 a)
130{
131 return float32_val(a) & 0x007FFFFF;
132}
133
134/*----------------------------------------------------------------------------
135| Returns the exponent bits of the single-precision floating-point value `a'.
136*----------------------------------------------------------------------------*/
137
138static inline int extractFloat32Exp(float32 a)
139{
140 return (float32_val(a) >> 23) & 0xFF;
141}
142
143/*----------------------------------------------------------------------------
144| Returns the sign bit of the single-precision floating-point value `a'.
145*----------------------------------------------------------------------------*/
146
147static inline flag extractFloat32Sign(float32 a)
148{
149 return float32_val(a) >> 31;
150}
151
152/*----------------------------------------------------------------------------
153| Returns the fraction bits of the double-precision floating-point value `a'.
154*----------------------------------------------------------------------------*/
155
156static inline uint64_t extractFloat64Frac(float64 a)
157{
158 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
159}
160
161/*----------------------------------------------------------------------------
162| Returns the exponent bits of the double-precision floating-point value `a'.
163*----------------------------------------------------------------------------*/
164
165static inline int extractFloat64Exp(float64 a)
166{
167 return (float64_val(a) >> 52) & 0x7FF;
168}
169
170/*----------------------------------------------------------------------------
171| Returns the sign bit of the double-precision floating-point value `a'.
172*----------------------------------------------------------------------------*/
173
174static inline flag extractFloat64Sign(float64 a)
175{
176 return float64_val(a) >> 63;
177}
178
a90119b5
AB
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 *
 * The enumerator order is therefore significant: new non-NaN classes
 * must be added before float_class_qnan.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,   /* raw parts, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
192
193/*
194 * Structure holding all of the decomposed parts of a float. The
195 * exponent is unbiased and the fraction is normalized. All
196 * calculations are done with a 64 bit fraction and then rounded as
197 * appropriate for the final format.
198 *
199 * Thanks to the packed FloatClass a decent compiler should be able to
200 * fit the whole structure into registers and avoid using the stack
201 * for parameter passing.
202 */
203
204typedef struct {
205 uint64_t frac;
206 int32_t exp;
207 FloatClass cls;
208 bool sign;
209} FloatParts;
210
/*
 * Bit positions used by the decomposed 64-bit fraction: the binary point
 * sits at bit 62, the implicit bit is the bit at the binary point, and
 * the overflow bit is the bit directly above it (bit 63).
 */
#define DECOMPOSED_BINARY_POINT (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
214
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
240
/*
 * Expand fields based on the size of exponent (E) and fraction (F).
 *
 * Expands to a list of designated initializers for a FloatFmt, so it is
 * meant to be used inside the braces of a FloatFmt initializer.  Both
 * arguments are parenthesized in the expansion so that expressions such
 * as FLOAT_PARAMS(5, 4 + 6) behave like FLOAT_PARAMS(5, 10).
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
252
253static const FloatFmt float16_params = {
254 FLOAT_PARAMS(5, 10)
255};
256
257static const FloatFmt float32_params = {
258 FLOAT_PARAMS(8, 23)
259};
260
261static const FloatFmt float64_params = {
262 FLOAT_PARAMS(11, 52)
263};
264
6fff2167
AB
265/* Unpack a float to parts, but do not canonicalize. */
266static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
267{
268 const int sign_pos = fmt.frac_size + fmt.exp_size;
269
270 return (FloatParts) {
271 .cls = float_class_unclassified,
272 .sign = extract64(raw, sign_pos, 1),
273 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
274 .frac = extract64(raw, 0, fmt.frac_size),
275 };
276}
277
278static inline FloatParts float16_unpack_raw(float16 f)
279{
280 return unpack_raw(float16_params, f);
281}
282
283static inline FloatParts float32_unpack_raw(float32 f)
284{
285 return unpack_raw(float32_params, f);
286}
287
288static inline FloatParts float64_unpack_raw(float64 f)
289{
290 return unpack_raw(float64_params, f);
291}
292
293/* Pack a float from parts, but do not canonicalize. */
294static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
295{
296 const int sign_pos = fmt.frac_size + fmt.exp_size;
297 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
298 return deposit64(ret, sign_pos, 1, p.sign);
299}
300
301static inline float16 float16_pack_raw(FloatParts p)
302{
303 return make_float16(pack_raw(float16_params, p));
304}
305
306static inline float32 float32_pack_raw(FloatParts p)
307{
308 return make_float32(pack_raw(float32_params, p));
309}
310
311static inline float64 float64_pack_raw(FloatParts p)
312{
313 return make_float64(pack_raw(float64_params, p));
314}
315
0664335a
RH
/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
324#include "softfloat-specialize.h"
325
6fff2167
AB
326/* Canonicalize EXP and FRAC, setting CLS. */
327static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
328 float_status *status)
329{
ca3a3d5a 330 if (part.exp == parm->exp_max && !parm->arm_althp) {
6fff2167
AB
331 if (part.frac == 0) {
332 part.cls = float_class_inf;
333 } else {
94933df0 334 part.frac <<= parm->frac_shift;
298b468e
RH
335 part.cls = (parts_is_snan_frac(part.frac, status)
336 ? float_class_snan : float_class_qnan);
6fff2167
AB
337 }
338 } else if (part.exp == 0) {
339 if (likely(part.frac == 0)) {
340 part.cls = float_class_zero;
341 } else if (status->flush_inputs_to_zero) {
342 float_raise(float_flag_input_denormal, status);
343 part.cls = float_class_zero;
344 part.frac = 0;
345 } else {
346 int shift = clz64(part.frac) - 1;
347 part.cls = float_class_normal;
348 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
349 part.frac <<= shift;
350 }
351 } else {
352 part.cls = float_class_normal;
353 part.exp -= parm->exp_bias;
354 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
355 }
356 return part;
357}
358
359/* Round and uncanonicalize a floating-point number by parts. There
360 * are FRAC_SHIFT bits that may require rounding at the bottom of the
361 * fraction; these bits will be removed. The exponent will be biased
362 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
363 */
364
365static FloatParts round_canonical(FloatParts p, float_status *s,
366 const FloatFmt *parm)
367{
368 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
369 const uint64_t round_mask = parm->round_mask;
370 const uint64_t roundeven_mask = parm->roundeven_mask;
371 const int exp_max = parm->exp_max;
372 const int frac_shift = parm->frac_shift;
373 uint64_t frac, inc;
374 int exp, flags = 0;
375 bool overflow_norm;
376
377 frac = p.frac;
378 exp = p.exp;
379
380 switch (p.cls) {
381 case float_class_normal:
382 switch (s->float_rounding_mode) {
383 case float_round_nearest_even:
384 overflow_norm = false;
385 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
386 break;
387 case float_round_ties_away:
388 overflow_norm = false;
389 inc = frac_lsbm1;
390 break;
391 case float_round_to_zero:
392 overflow_norm = true;
393 inc = 0;
394 break;
395 case float_round_up:
396 inc = p.sign ? 0 : round_mask;
397 overflow_norm = p.sign;
398 break;
399 case float_round_down:
400 inc = p.sign ? round_mask : 0;
401 overflow_norm = !p.sign;
402 break;
403 default:
404 g_assert_not_reached();
405 }
406
407 exp += parm->exp_bias;
408 if (likely(exp > 0)) {
409 if (frac & round_mask) {
410 flags |= float_flag_inexact;
411 frac += inc;
412 if (frac & DECOMPOSED_OVERFLOW_BIT) {
413 frac >>= 1;
414 exp++;
415 }
416 }
417 frac >>= frac_shift;
418
ca3a3d5a
AB
419 if (parm->arm_althp) {
420 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
421 if (unlikely(exp > exp_max)) {
422 /* Overflow. Return the maximum normal. */
423 flags = float_flag_invalid;
424 exp = exp_max;
425 frac = -1;
426 }
427 } else if (unlikely(exp >= exp_max)) {
6fff2167
AB
428 flags |= float_flag_overflow | float_flag_inexact;
429 if (overflow_norm) {
430 exp = exp_max - 1;
431 frac = -1;
432 } else {
433 p.cls = float_class_inf;
434 goto do_inf;
435 }
436 }
437 } else if (s->flush_to_zero) {
438 flags |= float_flag_output_denormal;
439 p.cls = float_class_zero;
440 goto do_zero;
441 } else {
442 bool is_tiny = (s->float_detect_tininess
443 == float_tininess_before_rounding)
444 || (exp < 0)
445 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
446
447 shift64RightJamming(frac, 1 - exp, &frac);
448 if (frac & round_mask) {
449 /* Need to recompute round-to-even. */
450 if (s->float_rounding_mode == float_round_nearest_even) {
451 inc = ((frac & roundeven_mask) != frac_lsbm1
452 ? frac_lsbm1 : 0);
453 }
454 flags |= float_flag_inexact;
455 frac += inc;
456 }
457
458 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
459 frac >>= frac_shift;
460
461 if (is_tiny && (flags & float_flag_inexact)) {
462 flags |= float_flag_underflow;
463 }
464 if (exp == 0 && frac == 0) {
465 p.cls = float_class_zero;
466 }
467 }
468 break;
469
470 case float_class_zero:
471 do_zero:
472 exp = 0;
473 frac = 0;
474 break;
475
476 case float_class_inf:
477 do_inf:
ca3a3d5a 478 assert(!parm->arm_althp);
6fff2167
AB
479 exp = exp_max;
480 frac = 0;
481 break;
482
483 case float_class_qnan:
484 case float_class_snan:
ca3a3d5a 485 assert(!parm->arm_althp);
6fff2167 486 exp = exp_max;
94933df0 487 frac >>= parm->frac_shift;
6fff2167
AB
488 break;
489
490 default:
491 g_assert_not_reached();
492 }
493
494 float_raise(flags, s);
495 p.exp = exp;
496 p.frac = frac;
497 return p;
498}
499
500static FloatParts float16_unpack_canonical(float16 f, float_status *s)
501{
502 return canonicalize(float16_unpack_raw(f), &float16_params, s);
503}
504
505static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
506{
0bcfbcbe 507 return float16_pack_raw(round_canonical(p, s, &float16_params));
6fff2167
AB
508}
509
510static FloatParts float32_unpack_canonical(float32 f, float_status *s)
511{
512 return canonicalize(float32_unpack_raw(f), &float32_params, s);
513}
514
515static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
516{
0bcfbcbe 517 return float32_pack_raw(round_canonical(p, s, &float32_params));
6fff2167
AB
518}
519
520static FloatParts float64_unpack_canonical(float64 f, float_status *s)
521{
522 return canonicalize(float64_unpack_raw(f), &float64_params, s);
523}
524
525static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
526{
0bcfbcbe 527 return float64_pack_raw(round_canonical(p, s, &float64_params));
6fff2167
AB
528}
529
530/* Simple helpers for checking if what NaN we have */
531static bool is_nan(FloatClass c)
532{
533 return unlikely(c >= float_class_qnan);
534}
535static bool is_snan(FloatClass c)
536{
537 return c == float_class_snan;
538}
539static bool is_qnan(FloatClass c)
540{
541 return c == float_class_qnan;
542}
543
dbe4d53a
AB
544static FloatParts return_nan(FloatParts a, float_status *s)
545{
546 switch (a.cls) {
547 case float_class_snan:
548 s->float_exception_flags |= float_flag_invalid;
0bcfbcbe 549 a = parts_silence_nan(a, s);
dbe4d53a
AB
550 /* fall through */
551 case float_class_qnan:
552 if (s->default_nan_mode) {
f7e598e2 553 return parts_default_nan(s);
dbe4d53a
AB
554 }
555 break;
556
557 default:
558 g_assert_not_reached();
559 }
560 return a;
561}
562
6fff2167
AB
563static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
564{
565 if (is_snan(a.cls) || is_snan(b.cls)) {
566 s->float_exception_flags |= float_flag_invalid;
567 }
568
569 if (s->default_nan_mode) {
f7e598e2 570 return parts_default_nan(s);
6fff2167
AB
571 } else {
572 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
573 is_qnan(b.cls), is_snan(b.cls),
574 a.frac > b.frac ||
575 (a.frac == b.frac && a.sign < b.sign))) {
576 a = b;
577 }
0bcfbcbe
RH
578 if (is_snan(a.cls)) {
579 return parts_silence_nan(a, s);
580 }
6fff2167
AB
581 }
582 return a;
583}
584
d446830a
AB
585static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
586 bool inf_zero, float_status *s)
587{
1839189b
PM
588 int which;
589
d446830a
AB
590 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
591 s->float_exception_flags |= float_flag_invalid;
592 }
593
1839189b
PM
594 which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
595 is_qnan(b.cls), is_snan(b.cls),
596 is_qnan(c.cls), is_snan(c.cls),
597 inf_zero, s);
598
d446830a 599 if (s->default_nan_mode) {
1839189b
PM
600 /* Note that this check is after pickNaNMulAdd so that function
601 * has an opportunity to set the Invalid flag.
602 */
f7e598e2 603 which = 3;
1839189b 604 }
d446830a 605
1839189b
PM
606 switch (which) {
607 case 0:
608 break;
609 case 1:
610 a = b;
611 break;
612 case 2:
613 a = c;
614 break;
615 case 3:
f7e598e2 616 return parts_default_nan(s);
1839189b
PM
617 default:
618 g_assert_not_reached();
d446830a 619 }
1839189b 620
0bcfbcbe
RH
621 if (is_snan(a.cls)) {
622 return parts_silence_nan(a, s);
623 }
d446830a
AB
624 return a;
625}
626
6fff2167
AB
627/*
628 * Returns the result of adding or subtracting the values of the
629 * floating-point values `a' and `b'. The operation is performed
630 * according to the IEC/IEEE Standard for Binary Floating-Point
631 * Arithmetic.
632 */
633
634static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
635 float_status *s)
636{
637 bool a_sign = a.sign;
638 bool b_sign = b.sign ^ subtract;
639
640 if (a_sign != b_sign) {
641 /* Subtraction */
642
643 if (a.cls == float_class_normal && b.cls == float_class_normal) {
644 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
645 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
646 a.frac = a.frac - b.frac;
647 } else {
648 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
649 a.frac = b.frac - a.frac;
650 a.exp = b.exp;
651 a_sign ^= 1;
652 }
653
654 if (a.frac == 0) {
655 a.cls = float_class_zero;
656 a.sign = s->float_rounding_mode == float_round_down;
657 } else {
658 int shift = clz64(a.frac) - 1;
659 a.frac = a.frac << shift;
660 a.exp = a.exp - shift;
661 a.sign = a_sign;
662 }
663 return a;
664 }
665 if (is_nan(a.cls) || is_nan(b.cls)) {
666 return pick_nan(a, b, s);
667 }
668 if (a.cls == float_class_inf) {
669 if (b.cls == float_class_inf) {
670 float_raise(float_flag_invalid, s);
f7e598e2 671 return parts_default_nan(s);
6fff2167
AB
672 }
673 return a;
674 }
675 if (a.cls == float_class_zero && b.cls == float_class_zero) {
676 a.sign = s->float_rounding_mode == float_round_down;
677 return a;
678 }
679 if (a.cls == float_class_zero || b.cls == float_class_inf) {
680 b.sign = a_sign ^ 1;
681 return b;
682 }
683 if (b.cls == float_class_zero) {
684 return a;
685 }
686 } else {
687 /* Addition */
688 if (a.cls == float_class_normal && b.cls == float_class_normal) {
689 if (a.exp > b.exp) {
690 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
691 } else if (a.exp < b.exp) {
692 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
693 a.exp = b.exp;
694 }
695 a.frac += b.frac;
696 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
697 a.frac >>= 1;
698 a.exp += 1;
699 }
700 return a;
701 }
702 if (is_nan(a.cls) || is_nan(b.cls)) {
703 return pick_nan(a, b, s);
704 }
705 if (a.cls == float_class_inf || b.cls == float_class_zero) {
706 return a;
707 }
708 if (b.cls == float_class_inf || a.cls == float_class_zero) {
709 b.sign = b_sign;
710 return b;
711 }
712 }
713 g_assert_not_reached();
714}
715
716/*
717 * Returns the result of adding or subtracting the floating-point
718 * values `a' and `b'. The operation is performed according to the
719 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
720 */
721
722float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
723 float_status *status)
724{
725 FloatParts pa = float16_unpack_canonical(a, status);
726 FloatParts pb = float16_unpack_canonical(b, status);
727 FloatParts pr = addsub_floats(pa, pb, false, status);
728
729 return float16_round_pack_canonical(pr, status);
730}
731
732float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
733 float_status *status)
734{
735 FloatParts pa = float32_unpack_canonical(a, status);
736 FloatParts pb = float32_unpack_canonical(b, status);
737 FloatParts pr = addsub_floats(pa, pb, false, status);
738
739 return float32_round_pack_canonical(pr, status);
740}
741
742float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
743 float_status *status)
744{
745 FloatParts pa = float64_unpack_canonical(a, status);
746 FloatParts pb = float64_unpack_canonical(b, status);
747 FloatParts pr = addsub_floats(pa, pb, false, status);
748
749 return float64_round_pack_canonical(pr, status);
750}
751
752float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
753 float_status *status)
754{
755 FloatParts pa = float16_unpack_canonical(a, status);
756 FloatParts pb = float16_unpack_canonical(b, status);
757 FloatParts pr = addsub_floats(pa, pb, true, status);
758
759 return float16_round_pack_canonical(pr, status);
760}
761
762float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
763 float_status *status)
764{
765 FloatParts pa = float32_unpack_canonical(a, status);
766 FloatParts pb = float32_unpack_canonical(b, status);
767 FloatParts pr = addsub_floats(pa, pb, true, status);
768
769 return float32_round_pack_canonical(pr, status);
770}
771
772float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
773 float_status *status)
774{
775 FloatParts pa = float64_unpack_canonical(a, status);
776 FloatParts pb = float64_unpack_canonical(b, status);
777 FloatParts pr = addsub_floats(pa, pb, true, status);
778
779 return float64_round_pack_canonical(pr, status);
780}
781
74d707e2
AB
782/*
783 * Returns the result of multiplying the floating-point values `a' and
784 * `b'. The operation is performed according to the IEC/IEEE Standard
785 * for Binary Floating-Point Arithmetic.
786 */
787
788static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
789{
790 bool sign = a.sign ^ b.sign;
791
792 if (a.cls == float_class_normal && b.cls == float_class_normal) {
793 uint64_t hi, lo;
794 int exp = a.exp + b.exp;
795
796 mul64To128(a.frac, b.frac, &hi, &lo);
797 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
798 if (lo & DECOMPOSED_OVERFLOW_BIT) {
799 shift64RightJamming(lo, 1, &lo);
800 exp += 1;
801 }
802
803 /* Re-use a */
804 a.exp = exp;
805 a.sign = sign;
806 a.frac = lo;
807 return a;
808 }
809 /* handle all the NaN cases */
810 if (is_nan(a.cls) || is_nan(b.cls)) {
811 return pick_nan(a, b, s);
812 }
813 /* Inf * Zero == NaN */
814 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
815 (a.cls == float_class_zero && b.cls == float_class_inf)) {
816 s->float_exception_flags |= float_flag_invalid;
f7e598e2 817 return parts_default_nan(s);
74d707e2
AB
818 }
819 /* Multiply by 0 or Inf */
820 if (a.cls == float_class_inf || a.cls == float_class_zero) {
821 a.sign = sign;
822 return a;
823 }
824 if (b.cls == float_class_inf || b.cls == float_class_zero) {
825 b.sign = sign;
826 return b;
827 }
828 g_assert_not_reached();
829}
830
831float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
832 float_status *status)
833{
834 FloatParts pa = float16_unpack_canonical(a, status);
835 FloatParts pb = float16_unpack_canonical(b, status);
836 FloatParts pr = mul_floats(pa, pb, status);
837
838 return float16_round_pack_canonical(pr, status);
839}
840
841float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
842 float_status *status)
843{
844 FloatParts pa = float32_unpack_canonical(a, status);
845 FloatParts pb = float32_unpack_canonical(b, status);
846 FloatParts pr = mul_floats(pa, pb, status);
847
848 return float32_round_pack_canonical(pr, status);
849}
850
851float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
852 float_status *status)
853{
854 FloatParts pa = float64_unpack_canonical(a, status);
855 FloatParts pb = float64_unpack_canonical(b, status);
856 FloatParts pr = mul_floats(pa, pb, status);
857
858 return float64_round_pack_canonical(pr, status);
859}
860
d446830a
AB
861/*
862 * Returns the result of multiplying the floating-point values `a' and
863 * `b' then adding 'c', with no intermediate rounding step after the
864 * multiplication. The operation is performed according to the
865 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
866 * The flags argument allows the caller to select negation of the
867 * addend, the intermediate product, or the final result. (The
868 * difference between this and having the caller do a separate
869 * negation is that negating externally will flip the sign bit on
870 * NaNs.)
871 */
872
873static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
874 int flags, float_status *s)
875{
876 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
877 ((1 << float_class_inf) | (1 << float_class_zero));
878 bool p_sign;
879 bool sign_flip = flags & float_muladd_negate_result;
880 FloatClass p_class;
881 uint64_t hi, lo;
882 int p_exp;
883
884 /* It is implementation-defined whether the cases of (0,inf,qnan)
885 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
886 * they return if they do), so we have to hand this information
887 * off to the target-specific pick-a-NaN routine.
888 */
889 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
890 return pick_nan_muladd(a, b, c, inf_zero, s);
891 }
892
893 if (inf_zero) {
894 s->float_exception_flags |= float_flag_invalid;
f7e598e2 895 return parts_default_nan(s);
d446830a
AB
896 }
897
898 if (flags & float_muladd_negate_c) {
899 c.sign ^= 1;
900 }
901
902 p_sign = a.sign ^ b.sign;
903
904 if (flags & float_muladd_negate_product) {
905 p_sign ^= 1;
906 }
907
908 if (a.cls == float_class_inf || b.cls == float_class_inf) {
909 p_class = float_class_inf;
910 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
911 p_class = float_class_zero;
912 } else {
913 p_class = float_class_normal;
914 }
915
916 if (c.cls == float_class_inf) {
917 if (p_class == float_class_inf && p_sign != c.sign) {
918 s->float_exception_flags |= float_flag_invalid;
f7e598e2 919 return parts_default_nan(s);
d446830a
AB
920 } else {
921 a.cls = float_class_inf;
922 a.sign = c.sign ^ sign_flip;
f7e598e2 923 return a;
d446830a 924 }
d446830a
AB
925 }
926
927 if (p_class == float_class_inf) {
928 a.cls = float_class_inf;
929 a.sign = p_sign ^ sign_flip;
930 return a;
931 }
932
933 if (p_class == float_class_zero) {
934 if (c.cls == float_class_zero) {
935 if (p_sign != c.sign) {
936 p_sign = s->float_rounding_mode == float_round_down;
937 }
938 c.sign = p_sign;
939 } else if (flags & float_muladd_halve_result) {
940 c.exp -= 1;
941 }
942 c.sign ^= sign_flip;
943 return c;
944 }
945
946 /* a & b should be normals now... */
947 assert(a.cls == float_class_normal &&
948 b.cls == float_class_normal);
949
950 p_exp = a.exp + b.exp;
951
952 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
953 * result.
954 */
955 mul64To128(a.frac, b.frac, &hi, &lo);
956 /* binary point now at bit 124 */
957
958 /* check for overflow */
959 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
960 shift128RightJamming(hi, lo, 1, &hi, &lo);
961 p_exp += 1;
962 }
963
964 /* + add/sub */
965 if (c.cls == float_class_zero) {
966 /* move binary point back to 62 */
967 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
968 } else {
969 int exp_diff = p_exp - c.exp;
970 if (p_sign == c.sign) {
971 /* Addition */
972 if (exp_diff <= 0) {
973 shift128RightJamming(hi, lo,
974 DECOMPOSED_BINARY_POINT - exp_diff,
975 &hi, &lo);
976 lo += c.frac;
977 p_exp = c.exp;
978 } else {
979 uint64_t c_hi, c_lo;
980 /* shift c to the same binary point as the product (124) */
981 c_hi = c.frac >> 2;
982 c_lo = 0;
983 shift128RightJamming(c_hi, c_lo,
984 exp_diff,
985 &c_hi, &c_lo);
986 add128(hi, lo, c_hi, c_lo, &hi, &lo);
987 /* move binary point back to 62 */
988 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
989 }
990
991 if (lo & DECOMPOSED_OVERFLOW_BIT) {
992 shift64RightJamming(lo, 1, &lo);
993 p_exp += 1;
994 }
995
996 } else {
997 /* Subtraction */
998 uint64_t c_hi, c_lo;
999 /* make C binary point match product at bit 124 */
1000 c_hi = c.frac >> 2;
1001 c_lo = 0;
1002
1003 if (exp_diff <= 0) {
1004 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1005 if (exp_diff == 0
1006 &&
1007 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1008 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1009 } else {
1010 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1011 p_sign ^= 1;
1012 p_exp = c.exp;
1013 }
1014 } else {
1015 shift128RightJamming(c_hi, c_lo,
1016 exp_diff,
1017 &c_hi, &c_lo);
1018 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1019 }
1020
1021 if (hi == 0 && lo == 0) {
1022 a.cls = float_class_zero;
1023 a.sign = s->float_rounding_mode == float_round_down;
1024 a.sign ^= sign_flip;
1025 return a;
1026 } else {
1027 int shift;
1028 if (hi != 0) {
1029 shift = clz64(hi);
1030 } else {
1031 shift = clz64(lo) + 64;
1032 }
1033 /* Normalizing to a binary point of 124 is the
1034 correct adjust for the exponent. However since we're
1035 shifting, we might as well put the binary point back
1036 at 62 where we really want it. Therefore shift as
1037 if we're leaving 1 bit at the top of the word, but
1038 adjust the exponent as if we're leaving 3 bits. */
1039 shift -= 1;
1040 if (shift >= 64) {
1041 lo = lo << (shift - 64);
1042 } else {
1043 hi = (hi << shift) | (lo >> (64 - shift));
1044 lo = hi | ((lo << shift) != 0);
1045 }
1046 p_exp -= shift - 2;
1047 }
1048 }
1049 }
1050
1051 if (flags & float_muladd_halve_result) {
1052 p_exp -= 1;
1053 }
1054
1055 /* finally prepare our result */
1056 a.cls = float_class_normal;
1057 a.sign = p_sign ^ sign_flip;
1058 a.exp = p_exp;
1059 a.frac = lo;
1060
1061 return a;
1062}
1063
1064float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1065 int flags, float_status *status)
1066{
1067 FloatParts pa = float16_unpack_canonical(a, status);
1068 FloatParts pb = float16_unpack_canonical(b, status);
1069 FloatParts pc = float16_unpack_canonical(c, status);
1070 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1071
1072 return float16_round_pack_canonical(pr, status);
1073}
1074
1075float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1076 int flags, float_status *status)
1077{
1078 FloatParts pa = float32_unpack_canonical(a, status);
1079 FloatParts pb = float32_unpack_canonical(b, status);
1080 FloatParts pc = float32_unpack_canonical(c, status);
1081 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1082
1083 return float32_round_pack_canonical(pr, status);
1084}
1085
1086float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1087 int flags, float_status *status)
1088{
1089 FloatParts pa = float64_unpack_canonical(a, status);
1090 FloatParts pb = float64_unpack_canonical(b, status);
1091 FloatParts pc = float64_unpack_canonical(c, status);
1092 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1093
1094 return float64_round_pack_canonical(pr, status);
1095}
1096
cf07323d
AB
1097/*
1098 * Returns the result of dividing the floating-point value `a' by the
1099 * corresponding value `b'. The operation is performed according to
1100 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1101 */
1102
1103static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1104{
1105 bool sign = a.sign ^ b.sign;
1106
1107 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1108 uint64_t temp_lo, temp_hi;
1109 int exp = a.exp - b.exp;
1110 if (a.frac < b.frac) {
1111 exp -= 1;
1112 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1113 &temp_hi, &temp_lo);
1114 } else {
1115 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1116 &temp_hi, &temp_lo);
1117 }
1118 /* LSB of quot is set if inexact which roundandpack will use
1119 * to set flags. Yet again we re-use a for the result */
1120 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1121 a.sign = sign;
1122 a.exp = exp;
1123 return a;
1124 }
1125 /* handle all the NaN cases */
1126 if (is_nan(a.cls) || is_nan(b.cls)) {
1127 return pick_nan(a, b, s);
1128 }
1129 /* 0/0 or Inf/Inf */
1130 if (a.cls == b.cls
1131 &&
1132 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1133 s->float_exception_flags |= float_flag_invalid;
f7e598e2 1134 return parts_default_nan(s);
cf07323d 1135 }
9cb4e398
AB
1136 /* Inf / x or 0 / x */
1137 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1138 a.sign = sign;
1139 return a;
1140 }
cf07323d
AB
1141 /* Div 0 => Inf */
1142 if (b.cls == float_class_zero) {
1143 s->float_exception_flags |= float_flag_divbyzero;
1144 a.cls = float_class_inf;
1145 a.sign = sign;
1146 return a;
1147 }
cf07323d
AB
1148 /* Div by Inf */
1149 if (b.cls == float_class_inf) {
1150 a.cls = float_class_zero;
1151 a.sign = sign;
1152 return a;
1153 }
1154 g_assert_not_reached();
1155}
1156
1157float16 float16_div(float16 a, float16 b, float_status *status)
1158{
1159 FloatParts pa = float16_unpack_canonical(a, status);
1160 FloatParts pb = float16_unpack_canonical(b, status);
1161 FloatParts pr = div_floats(pa, pb, status);
1162
1163 return float16_round_pack_canonical(pr, status);
1164}
1165
1166float32 float32_div(float32 a, float32 b, float_status *status)
1167{
1168 FloatParts pa = float32_unpack_canonical(a, status);
1169 FloatParts pb = float32_unpack_canonical(b, status);
1170 FloatParts pr = div_floats(pa, pb, status);
1171
1172 return float32_round_pack_canonical(pr, status);
1173}
1174
1175float64 float64_div(float64 a, float64 b, float_status *status)
1176{
1177 FloatParts pa = float64_unpack_canonical(a, status);
1178 FloatParts pb = float64_unpack_canonical(b, status);
1179 FloatParts pr = div_floats(pa, pb, status);
1180
1181 return float64_round_pack_canonical(pr, status);
1182}
1183
dbe4d53a
AB
1184/*
1185 * Rounds the floating-point value `a' to an integer, and returns the
1186 * result as a floating-point value. The operation is performed
1187 * according to the IEC/IEEE Standard for Binary Floating-Point
1188 * Arithmetic.
1189 */
1190
1191static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1192{
1193 if (is_nan(a.cls)) {
1194 return return_nan(a, s);
1195 }
1196
1197 switch (a.cls) {
1198 case float_class_zero:
1199 case float_class_inf:
1200 case float_class_qnan:
1201 /* already "integral" */
1202 break;
1203 case float_class_normal:
1204 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1205 /* already integral */
1206 break;
1207 }
1208 if (a.exp < 0) {
1209 bool one;
1210 /* all fractional */
1211 s->float_exception_flags |= float_flag_inexact;
1212 switch (rounding_mode) {
1213 case float_round_nearest_even:
1214 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1215 break;
1216 case float_round_ties_away:
1217 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1218 break;
1219 case float_round_to_zero:
1220 one = false;
1221 break;
1222 case float_round_up:
1223 one = !a.sign;
1224 break;
1225 case float_round_down:
1226 one = a.sign;
1227 break;
1228 default:
1229 g_assert_not_reached();
1230 }
1231
1232 if (one) {
1233 a.frac = DECOMPOSED_IMPLICIT_BIT;
1234 a.exp = 0;
1235 } else {
1236 a.cls = float_class_zero;
1237 }
1238 } else {
1239 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1240 uint64_t frac_lsbm1 = frac_lsb >> 1;
1241 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1242 uint64_t rnd_mask = rnd_even_mask >> 1;
1243 uint64_t inc;
1244
1245 switch (rounding_mode) {
1246 case float_round_nearest_even:
1247 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1248 break;
1249 case float_round_ties_away:
1250 inc = frac_lsbm1;
1251 break;
1252 case float_round_to_zero:
1253 inc = 0;
1254 break;
1255 case float_round_up:
1256 inc = a.sign ? 0 : rnd_mask;
1257 break;
1258 case float_round_down:
1259 inc = a.sign ? rnd_mask : 0;
1260 break;
1261 default:
1262 g_assert_not_reached();
1263 }
1264
1265 if (a.frac & rnd_mask) {
1266 s->float_exception_flags |= float_flag_inexact;
1267 a.frac += inc;
1268 a.frac &= ~rnd_mask;
1269 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1270 a.frac >>= 1;
1271 a.exp++;
1272 }
1273 }
1274 }
1275 break;
1276 default:
1277 g_assert_not_reached();
1278 }
1279 return a;
1280}
1281
1282float16 float16_round_to_int(float16 a, float_status *s)
1283{
1284 FloatParts pa = float16_unpack_canonical(a, s);
1285 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1286 return float16_round_pack_canonical(pr, s);
1287}
1288
1289float32 float32_round_to_int(float32 a, float_status *s)
1290{
1291 FloatParts pa = float32_unpack_canonical(a, s);
1292 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1293 return float32_round_pack_canonical(pr, s);
1294}
1295
1296float64 float64_round_to_int(float64 a, float_status *s)
1297{
1298 FloatParts pa = float64_unpack_canonical(a, s);
1299 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1300 return float64_round_pack_canonical(pr, s);
1301}
1302
1303float64 float64_trunc_to_int(float64 a, float_status *s)
1304{
1305 FloatParts pa = float64_unpack_canonical(a, s);
1306 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1307 return float64_round_pack_canonical(pr, s);
1308}
1309
ab52f973
AB
1310/*
1311 * Returns the result of converting the floating-point value `a' to
1312 * the two's complement integer format. The conversion is performed
1313 * according to the IEC/IEEE Standard for Binary Floating-Point
1314 * Arithmetic---which means in particular that the conversion is
1315 * rounded according to the current rounding mode. If `a' is a NaN,
1316 * the largest positive integer is returned. Otherwise, if the
1317 * conversion overflows, the largest integer with the same sign as `a'
1318 * is returned.
1319*/
1320
1321static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1322 int64_t min, int64_t max,
1323 float_status *s)
1324{
1325 uint64_t r;
1326 int orig_flags = get_float_exception_flags(s);
1327 FloatParts p = round_to_int(in, rmode, s);
1328
1329 switch (p.cls) {
1330 case float_class_snan:
1331 case float_class_qnan:
801bc563 1332 s->float_exception_flags = orig_flags | float_flag_invalid;
ab52f973
AB
1333 return max;
1334 case float_class_inf:
801bc563 1335 s->float_exception_flags = orig_flags | float_flag_invalid;
ab52f973
AB
1336 return p.sign ? min : max;
1337 case float_class_zero:
1338 return 0;
1339 case float_class_normal:
1340 if (p.exp < DECOMPOSED_BINARY_POINT) {
1341 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1342 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1343 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1344 } else {
1345 r = UINT64_MAX;
1346 }
1347 if (p.sign) {
33358375 1348 if (r <= -(uint64_t) min) {
ab52f973
AB
1349 return -r;
1350 } else {
1351 s->float_exception_flags = orig_flags | float_flag_invalid;
1352 return min;
1353 }
1354 } else {
33358375 1355 if (r <= max) {
ab52f973
AB
1356 return r;
1357 } else {
1358 s->float_exception_flags = orig_flags | float_flag_invalid;
1359 return max;
1360 }
1361 }
1362 default:
1363 g_assert_not_reached();
1364 }
1365}
1366
1367#define FLOAT_TO_INT(fsz, isz) \
1368int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1369 float_status *s) \
1370{ \
1371 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1372 return round_to_int_and_pack(p, s->float_rounding_mode, \
1373 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1374 s); \
1375} \
1376 \
1377int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1378 (float ## fsz a, float_status *s) \
1379{ \
1380 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1381 return round_to_int_and_pack(p, float_round_to_zero, \
1382 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1383 s); \
1384}
1385
1386FLOAT_TO_INT(16, 16)
1387FLOAT_TO_INT(16, 32)
1388FLOAT_TO_INT(16, 64)
1389
1390FLOAT_TO_INT(32, 16)
1391FLOAT_TO_INT(32, 32)
1392FLOAT_TO_INT(32, 64)
1393
1394FLOAT_TO_INT(64, 16)
1395FLOAT_TO_INT(64, 32)
1396FLOAT_TO_INT(64, 64)
1397
1398#undef FLOAT_TO_INT
1399
1400/*
1401 * Returns the result of converting the floating-point value `a' to
1402 * the unsigned integer format. The conversion is performed according
1403 * to the IEC/IEEE Standard for Binary Floating-Point
1404 * Arithmetic---which means in particular that the conversion is
1405 * rounded according to the current rounding mode. If `a' is a NaN,
1406 * the largest unsigned integer is returned. Otherwise, if the
1407 * conversion overflows, the largest unsigned integer is returned. If
1408 * the 'a' is negative, the result is rounded and zero is returned;
1409 * values that do not round to zero will raise the inexact exception
1410 * flag.
1411 */
1412
1413static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1414 float_status *s)
1415{
1416 int orig_flags = get_float_exception_flags(s);
1417 FloatParts p = round_to_int(in, rmode, s);
1418
1419 switch (p.cls) {
1420 case float_class_snan:
1421 case float_class_qnan:
1422 s->float_exception_flags = orig_flags | float_flag_invalid;
1423 return max;
1424 case float_class_inf:
801bc563 1425 s->float_exception_flags = orig_flags | float_flag_invalid;
ab52f973
AB
1426 return p.sign ? 0 : max;
1427 case float_class_zero:
1428 return 0;
1429 case float_class_normal:
1430 {
1431 uint64_t r;
1432 if (p.sign) {
1433 s->float_exception_flags = orig_flags | float_flag_invalid;
1434 return 0;
1435 }
1436
1437 if (p.exp < DECOMPOSED_BINARY_POINT) {
1438 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1439 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1440 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1441 } else {
1442 s->float_exception_flags = orig_flags | float_flag_invalid;
1443 return max;
1444 }
1445
1446 /* For uint64 this will never trip, but if p.exp is too large
1447 * to shift a decomposed fraction we shall have exited via the
1448 * 3rd leg above.
1449 */
1450 if (r > max) {
1451 s->float_exception_flags = orig_flags | float_flag_invalid;
1452 return max;
1453 } else {
1454 return r;
1455 }
1456 }
1457 default:
1458 g_assert_not_reached();
1459 }
1460}
1461
1462#define FLOAT_TO_UINT(fsz, isz) \
1463uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1464 float_status *s) \
1465{ \
1466 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1467 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1468 UINT ## isz ## _MAX, s); \
1469} \
1470 \
1471uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1472 (float ## fsz a, float_status *s) \
1473{ \
1474 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
bd49e602
RH
1475 return round_to_uint_and_pack(p, float_round_to_zero, \
1476 UINT ## isz ## _MAX, s); \
ab52f973
AB
1477}
1478
1479FLOAT_TO_UINT(16, 16)
1480FLOAT_TO_UINT(16, 32)
1481FLOAT_TO_UINT(16, 64)
1482
1483FLOAT_TO_UINT(32, 16)
1484FLOAT_TO_UINT(32, 32)
1485FLOAT_TO_UINT(32, 64)
1486
1487FLOAT_TO_UINT(64, 16)
1488FLOAT_TO_UINT(64, 32)
1489FLOAT_TO_UINT(64, 64)
1490
1491#undef FLOAT_TO_UINT
1492
c02e1fb8
AB
1493/*
1494 * Integer to float conversions
1495 *
1496 * Returns the result of converting the two's complement integer `a'
1497 * to the floating-point format. The conversion is performed according
1498 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 */
1500
1501static FloatParts int_to_float(int64_t a, float_status *status)
1502{
a5a5f5e2 1503 FloatParts r = {};
c02e1fb8
AB
1504 if (a == 0) {
1505 r.cls = float_class_zero;
1506 r.sign = false;
1507 } else if (a == (1ULL << 63)) {
1508 r.cls = float_class_normal;
1509 r.sign = true;
1510 r.frac = DECOMPOSED_IMPLICIT_BIT;
1511 r.exp = 63;
1512 } else {
1513 uint64_t f;
1514 if (a < 0) {
1515 f = -a;
1516 r.sign = true;
1517 } else {
1518 f = a;
1519 r.sign = false;
1520 }
1521 int shift = clz64(f) - 1;
1522 r.cls = float_class_normal;
1523 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1524 r.frac = f << shift;
1525 }
1526
1527 return r;
1528}
1529
1530float16 int64_to_float16(int64_t a, float_status *status)
1531{
1532 FloatParts pa = int_to_float(a, status);
1533 return float16_round_pack_canonical(pa, status);
1534}
1535
1536float16 int32_to_float16(int32_t a, float_status *status)
1537{
1538 return int64_to_float16(a, status);
1539}
1540
1541float16 int16_to_float16(int16_t a, float_status *status)
1542{
1543 return int64_to_float16(a, status);
1544}
1545
1546float32 int64_to_float32(int64_t a, float_status *status)
1547{
1548 FloatParts pa = int_to_float(a, status);
1549 return float32_round_pack_canonical(pa, status);
1550}
1551
1552float32 int32_to_float32(int32_t a, float_status *status)
1553{
1554 return int64_to_float32(a, status);
1555}
1556
1557float32 int16_to_float32(int16_t a, float_status *status)
1558{
1559 return int64_to_float32(a, status);
1560}
1561
1562float64 int64_to_float64(int64_t a, float_status *status)
1563{
1564 FloatParts pa = int_to_float(a, status);
1565 return float64_round_pack_canonical(pa, status);
1566}
1567
1568float64 int32_to_float64(int32_t a, float_status *status)
1569{
1570 return int64_to_float64(a, status);
1571}
1572
1573float64 int16_to_float64(int16_t a, float_status *status)
1574{
1575 return int64_to_float64(a, status);
1576}
1577
1578
1579/*
1580 * Unsigned Integer to float conversions
1581 *
1582 * Returns the result of converting the unsigned integer `a' to the
1583 * floating-point format. The conversion is performed according to the
1584 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1585 */
1586
1587static FloatParts uint_to_float(uint64_t a, float_status *status)
1588{
1589 FloatParts r = { .sign = false};
1590
1591 if (a == 0) {
1592 r.cls = float_class_zero;
1593 } else {
1594 int spare_bits = clz64(a) - 1;
1595 r.cls = float_class_normal;
1596 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1597 if (spare_bits < 0) {
1598 shift64RightJamming(a, -spare_bits, &a);
1599 r.frac = a;
1600 } else {
1601 r.frac = a << spare_bits;
1602 }
1603 }
1604
1605 return r;
1606}
1607
1608float16 uint64_to_float16(uint64_t a, float_status *status)
1609{
1610 FloatParts pa = uint_to_float(a, status);
1611 return float16_round_pack_canonical(pa, status);
1612}
1613
1614float16 uint32_to_float16(uint32_t a, float_status *status)
1615{
1616 return uint64_to_float16(a, status);
1617}
1618
1619float16 uint16_to_float16(uint16_t a, float_status *status)
1620{
1621 return uint64_to_float16(a, status);
1622}
1623
1624float32 uint64_to_float32(uint64_t a, float_status *status)
1625{
1626 FloatParts pa = uint_to_float(a, status);
1627 return float32_round_pack_canonical(pa, status);
1628}
1629
1630float32 uint32_to_float32(uint32_t a, float_status *status)
1631{
1632 return uint64_to_float32(a, status);
1633}
1634
1635float32 uint16_to_float32(uint16_t a, float_status *status)
1636{
1637 return uint64_to_float32(a, status);
1638}
1639
1640float64 uint64_to_float64(uint64_t a, float_status *status)
1641{
1642 FloatParts pa = uint_to_float(a, status);
1643 return float64_round_pack_canonical(pa, status);
1644}
1645
1646float64 uint32_to_float64(uint32_t a, float_status *status)
1647{
1648 return uint64_to_float64(a, status);
1649}
1650
1651float64 uint16_to_float64(uint16_t a, float_status *status)
1652{
1653 return uint64_to_float64(a, status);
1654}
1655
89360067
AB
1656/* Float Min/Max */
1657/* min() and max() functions. These can't be implemented as
1658 * 'compare and pick one input' because that would mishandle
1659 * NaNs and +0 vs -0.
1660 *
1661 * minnum() and maxnum() functions. These are similar to the min()
1662 * and max() functions but if one of the arguments is a QNaN and
1663 * the other is numerical then the numerical argument is returned.
1664 * SNaNs will get quietened before being returned.
1665 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1666 * and maxNum() operations. min() and max() are the typical min/max
1667 * semantics provided by many CPUs which predate that specification.
1668 *
1669 * minnummag() and maxnummag() functions correspond to minNumMag()
1670 * and minNumMag() from the IEEE-754 2008.
1671 */
1672static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1673 bool ieee, bool ismag, float_status *s)
1674{
1675 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1676 if (ieee) {
1677 /* Takes two floating-point values `a' and `b', one of
1678 * which is a NaN, and returns the appropriate NaN
1679 * result. If either `a' or `b' is a signaling NaN,
1680 * the invalid exception is raised.
1681 */
1682 if (is_snan(a.cls) || is_snan(b.cls)) {
1683 return pick_nan(a, b, s);
1684 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1685 return b;
1686 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1687 return a;
1688 }
1689 }
1690 return pick_nan(a, b, s);
1691 } else {
1692 int a_exp, b_exp;
89360067
AB
1693
1694 switch (a.cls) {
1695 case float_class_normal:
1696 a_exp = a.exp;
1697 break;
1698 case float_class_inf:
1699 a_exp = INT_MAX;
1700 break;
1701 case float_class_zero:
1702 a_exp = INT_MIN;
1703 break;
1704 default:
1705 g_assert_not_reached();
1706 break;
1707 }
1708 switch (b.cls) {
1709 case float_class_normal:
1710 b_exp = b.exp;
1711 break;
1712 case float_class_inf:
1713 b_exp = INT_MAX;
1714 break;
1715 case float_class_zero:
1716 b_exp = INT_MIN;
1717 break;
1718 default:
1719 g_assert_not_reached();
1720 break;
1721 }
1722
6245327a
EC
1723 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1724 bool a_less = a_exp < b_exp;
1725 if (a_exp == b_exp) {
1726 a_less = a.frac < b.frac;
1727 }
1728 return a_less ^ ismin ? b : a;
89360067
AB
1729 }
1730
6245327a 1731 if (a.sign == b.sign) {
89360067
AB
1732 bool a_less = a_exp < b_exp;
1733 if (a_exp == b_exp) {
1734 a_less = a.frac < b.frac;
1735 }
6245327a 1736 return a.sign ^ a_less ^ ismin ? b : a;
89360067 1737 } else {
6245327a 1738 return a.sign ^ ismin ? b : a;
89360067
AB
1739 }
1740 }
1741}
1742
1743#define MINMAX(sz, name, ismin, isiee, ismag) \
1744float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1745 float_status *s) \
1746{ \
1747 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1748 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1749 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1750 \
1751 return float ## sz ## _round_pack_canonical(pr, s); \
1752}
1753
1754MINMAX(16, min, true, false, false)
1755MINMAX(16, minnum, true, true, false)
1756MINMAX(16, minnummag, true, true, true)
1757MINMAX(16, max, false, false, false)
1758MINMAX(16, maxnum, false, true, false)
1759MINMAX(16, maxnummag, false, true, true)
1760
1761MINMAX(32, min, true, false, false)
1762MINMAX(32, minnum, true, true, false)
1763MINMAX(32, minnummag, true, true, true)
1764MINMAX(32, max, false, false, false)
1765MINMAX(32, maxnum, false, true, false)
1766MINMAX(32, maxnummag, false, true, true)
1767
1768MINMAX(64, min, true, false, false)
1769MINMAX(64, minnum, true, true, false)
1770MINMAX(64, minnummag, true, true, true)
1771MINMAX(64, max, false, false, false)
1772MINMAX(64, maxnum, false, true, false)
1773MINMAX(64, maxnummag, false, true, true)
1774
1775#undef MINMAX
1776
0c4c9092
AB
1777/* Floating point compare */
1778static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1779 float_status *s)
1780{
1781 if (is_nan(a.cls) || is_nan(b.cls)) {
1782 if (!is_quiet ||
1783 a.cls == float_class_snan ||
1784 b.cls == float_class_snan) {
1785 s->float_exception_flags |= float_flag_invalid;
1786 }
1787 return float_relation_unordered;
1788 }
1789
1790 if (a.cls == float_class_zero) {
1791 if (b.cls == float_class_zero) {
1792 return float_relation_equal;
1793 }
1794 return b.sign ? float_relation_greater : float_relation_less;
1795 } else if (b.cls == float_class_zero) {
1796 return a.sign ? float_relation_less : float_relation_greater;
1797 }
1798
1799 /* The only really important thing about infinity is its sign. If
1800 * both are infinities the sign marks the smallest of the two.
1801 */
1802 if (a.cls == float_class_inf) {
1803 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1804 return float_relation_equal;
1805 }
1806 return a.sign ? float_relation_less : float_relation_greater;
1807 } else if (b.cls == float_class_inf) {
1808 return b.sign ? float_relation_greater : float_relation_less;
1809 }
1810
1811 if (a.sign != b.sign) {
1812 return a.sign ? float_relation_less : float_relation_greater;
1813 }
1814
1815 if (a.exp == b.exp) {
1816 if (a.frac == b.frac) {
1817 return float_relation_equal;
1818 }
1819 if (a.sign) {
1820 return a.frac > b.frac ?
1821 float_relation_less : float_relation_greater;
1822 } else {
1823 return a.frac > b.frac ?
1824 float_relation_greater : float_relation_less;
1825 }
1826 } else {
1827 if (a.sign) {
1828 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1829 } else {
1830 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1831 }
1832 }
1833}
1834
1835#define COMPARE(sz) \
1836int float ## sz ## _compare(float ## sz a, float ## sz b, \
1837 float_status *s) \
1838{ \
1839 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1840 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1841 return compare_floats(pa, pb, false, s); \
1842} \
1843int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1844 float_status *s) \
1845{ \
1846 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1847 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1848 return compare_floats(pa, pb, true, s); \
1849}
1850
1851COMPARE(16)
1852COMPARE(32)
1853COMPARE(64)
1854
1855#undef COMPARE
1856
0bfc9f19
AB
1857/* Multiply A by 2 raised to the power N. */
1858static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1859{
1860 if (unlikely(is_nan(a.cls))) {
1861 return return_nan(a, s);
1862 }
1863 if (a.cls == float_class_normal) {
ce8d4082
RH
1864 /* The largest float type (even though not supported by FloatParts)
1865 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1866 * still allows rounding to infinity, without allowing overflow
1867 * within the int32_t that backs FloatParts.exp.
1868 */
1869 n = MIN(MAX(n, -0x10000), 0x10000);
0bfc9f19
AB
1870 a.exp += n;
1871 }
1872 return a;
1873}
1874
1875float16 float16_scalbn(float16 a, int n, float_status *status)
1876{
1877 FloatParts pa = float16_unpack_canonical(a, status);
1878 FloatParts pr = scalbn_decomposed(pa, n, status);
1879 return float16_round_pack_canonical(pr, status);
1880}
1881
1882float32 float32_scalbn(float32 a, int n, float_status *status)
1883{
1884 FloatParts pa = float32_unpack_canonical(a, status);
1885 FloatParts pr = scalbn_decomposed(pa, n, status);
1886 return float32_round_pack_canonical(pr, status);
1887}
1888
1889float64 float64_scalbn(float64 a, int n, float_status *status)
1890{
1891 FloatParts pa = float64_unpack_canonical(a, status);
1892 FloatParts pr = scalbn_decomposed(pa, n, status);
1893 return float64_round_pack_canonical(pr, status);
1894}
1895
c13bb2da
AB
1896/*
1897 * Square Root
1898 *
1899 * The old softfloat code did an approximation step before zeroing in
1900 * on the final result. However for simpleness we just compute the
1901 * square root by iterating down from the implicit bit to enough extra
1902 * bits to ensure we get a correctly rounded result.
1903 *
1904 * This does mean however the calculation is slower than before,
1905 * especially for 64 bit floats.
1906 */
1907
1908static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1909{
1910 uint64_t a_frac, r_frac, s_frac;
1911 int bit, last_bit;
1912
1913 if (is_nan(a.cls)) {
1914 return return_nan(a, s);
1915 }
1916 if (a.cls == float_class_zero) {
1917 return a; /* sqrt(+-0) = +-0 */
1918 }
1919 if (a.sign) {
1920 s->float_exception_flags |= float_flag_invalid;
f7e598e2 1921 return parts_default_nan(s);
c13bb2da
AB
1922 }
1923 if (a.cls == float_class_inf) {
1924 return a; /* sqrt(+inf) = +inf */
1925 }
1926
1927 assert(a.cls == float_class_normal);
1928
1929 /* We need two overflow bits at the top. Adding room for that is a
1930 * right shift. If the exponent is odd, we can discard the low bit
1931 * by multiplying the fraction by 2; that's a left shift. Combine
1932 * those and we shift right if the exponent is even.
1933 */
1934 a_frac = a.frac;
1935 if (!(a.exp & 1)) {
1936 a_frac >>= 1;
1937 }
1938 a.exp >>= 1;
1939
1940 /* Bit-by-bit computation of sqrt. */
1941 r_frac = 0;
1942 s_frac = 0;
1943
1944 /* Iterate from implicit bit down to the 3 extra bits to compute a
1945 * properly rounded result. Remember we've inserted one more bit
1946 * at the top, so these positions are one less.
1947 */
1948 bit = DECOMPOSED_BINARY_POINT - 1;
1949 last_bit = MAX(p->frac_shift - 4, 0);
1950 do {
1951 uint64_t q = 1ULL << bit;
1952 uint64_t t_frac = s_frac + q;
1953 if (t_frac <= a_frac) {
1954 s_frac = t_frac + q;
1955 a_frac -= t_frac;
1956 r_frac += q;
1957 }
1958 a_frac <<= 1;
1959 } while (--bit >= last_bit);
1960
1961 /* Undo the right shift done above. If there is any remaining
1962 * fraction, the result is inexact. Set the sticky bit.
1963 */
1964 a.frac = (r_frac << 1) + (a_frac != 0);
1965
1966 return a;
1967}
1968
1969float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1970{
1971 FloatParts pa = float16_unpack_canonical(a, status);
1972 FloatParts pr = sqrt_float(pa, status, &float16_params);
1973 return float16_round_pack_canonical(pr, status);
1974}
1975
1976float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1977{
1978 FloatParts pa = float32_unpack_canonical(a, status);
1979 FloatParts pr = sqrt_float(pa, status, &float32_params);
1980 return float32_round_pack_canonical(pr, status);
1981}
1982
1983float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1984{
1985 FloatParts pa = float64_unpack_canonical(a, status);
1986 FloatParts pr = sqrt_float(pa, status, &float64_params);
1987 return float64_round_pack_canonical(pr, status);
1988}
1989
1990
158142c2
FB
1991/*----------------------------------------------------------------------------
1992| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1993| and 7, and returns the properly rounded 32-bit integer corresponding to the
1994| input. If `zSign' is 1, the input is negated before being converted to an
1995| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
1996| is simply rounded to an integer, with the inexact exception raised if the
1997| input cannot be represented exactly as an integer. However, if the fixed-
1998| point input is too large, the invalid exception is raised and the largest
1999| positive or negative integer is returned.
2000*----------------------------------------------------------------------------*/
2001
f4014512 2002static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
158142c2 2003{
8f506c70 2004 int8_t roundingMode;
158142c2 2005 flag roundNearestEven;
8f506c70 2006 int8_t roundIncrement, roundBits;
760e1416 2007 int32_t z;
158142c2 2008
a2f2d288 2009 roundingMode = status->float_rounding_mode;
158142c2 2010 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2011 switch (roundingMode) {
2012 case float_round_nearest_even:
f9288a76 2013 case float_round_ties_away:
dc355b76
PM
2014 roundIncrement = 0x40;
2015 break;
2016 case float_round_to_zero:
2017 roundIncrement = 0;
2018 break;
2019 case float_round_up:
2020 roundIncrement = zSign ? 0 : 0x7f;
2021 break;
2022 case float_round_down:
2023 roundIncrement = zSign ? 0x7f : 0;
2024 break;
2025 default:
2026 abort();
158142c2
FB
2027 }
2028 roundBits = absZ & 0x7F;
2029 absZ = ( absZ + roundIncrement )>>7;
2030 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2031 z = absZ;
2032 if ( zSign ) z = - z;
2033 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
ff32e16e 2034 float_raise(float_flag_invalid, status);
bb98fe42 2035 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2 2036 }
a2f2d288
PM
2037 if (roundBits) {
2038 status->float_exception_flags |= float_flag_inexact;
2039 }
158142c2
FB
2040 return z;
2041
2042}
2043
2044/*----------------------------------------------------------------------------
2045| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2046| `absZ1', with binary point between bits 63 and 64 (between the input words),
2047| and returns the properly rounded 64-bit integer corresponding to the input.
2048| If `zSign' is 1, the input is negated before being converted to an integer.
2049| Ordinarily, the fixed-point input is simply rounded to an integer, with
2050| the inexact exception raised if the input cannot be represented exactly as
2051| an integer. However, if the fixed-point input is too large, the invalid
2052| exception is raised and the largest positive or negative integer is
2053| returned.
2054*----------------------------------------------------------------------------*/
2055
f42c2224 2056static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
e5a41ffa 2057 float_status *status)
158142c2 2058{
8f506c70 2059 int8_t roundingMode;
158142c2 2060 flag roundNearestEven, increment;
760e1416 2061 int64_t z;
158142c2 2062
a2f2d288 2063 roundingMode = status->float_rounding_mode;
158142c2 2064 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2065 switch (roundingMode) {
2066 case float_round_nearest_even:
f9288a76 2067 case float_round_ties_away:
dc355b76
PM
2068 increment = ((int64_t) absZ1 < 0);
2069 break;
2070 case float_round_to_zero:
2071 increment = 0;
2072 break;
2073 case float_round_up:
2074 increment = !zSign && absZ1;
2075 break;
2076 case float_round_down:
2077 increment = zSign && absZ1;
2078 break;
2079 default:
2080 abort();
158142c2
FB
2081 }
2082 if ( increment ) {
2083 ++absZ0;
2084 if ( absZ0 == 0 ) goto overflow;
bb98fe42 2085 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
2086 }
2087 z = absZ0;
2088 if ( zSign ) z = - z;
2089 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2090 overflow:
ff32e16e 2091 float_raise(float_flag_invalid, status);
158142c2 2092 return
bb98fe42 2093 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
2094 : LIT64( 0x7FFFFFFFFFFFFFFF );
2095 }
a2f2d288
PM
2096 if (absZ1) {
2097 status->float_exception_flags |= float_flag_inexact;
2098 }
158142c2
FB
2099 return z;
2100
2101}
2102
fb3ea83a
TM
2103/*----------------------------------------------------------------------------
2104| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2105| `absZ1', with binary point between bits 63 and 64 (between the input words),
2106| and returns the properly rounded 64-bit unsigned integer corresponding to the
2107| input. Ordinarily, the fixed-point input is simply rounded to an integer,
2108| with the inexact exception raised if the input cannot be represented exactly
2109| as an integer. However, if the fixed-point input is too large, the invalid
2110| exception is raised and the largest unsigned integer is returned.
2111*----------------------------------------------------------------------------*/
2112
f42c2224 2113static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
e5a41ffa 2114 uint64_t absZ1, float_status *status)
fb3ea83a 2115{
8f506c70 2116 int8_t roundingMode;
fb3ea83a
TM
2117 flag roundNearestEven, increment;
2118
a2f2d288 2119 roundingMode = status->float_rounding_mode;
fb3ea83a 2120 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
2121 switch (roundingMode) {
2122 case float_round_nearest_even:
f9288a76 2123 case float_round_ties_away:
dc355b76
PM
2124 increment = ((int64_t)absZ1 < 0);
2125 break;
2126 case float_round_to_zero:
2127 increment = 0;
2128 break;
2129 case float_round_up:
2130 increment = !zSign && absZ1;
2131 break;
2132 case float_round_down:
2133 increment = zSign && absZ1;
2134 break;
2135 default:
2136 abort();
fb3ea83a
TM
2137 }
2138 if (increment) {
2139 ++absZ0;
2140 if (absZ0 == 0) {
ff32e16e 2141 float_raise(float_flag_invalid, status);
fb3ea83a
TM
2142 return LIT64(0xFFFFFFFFFFFFFFFF);
2143 }
2144 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2145 }
2146
2147 if (zSign && absZ0) {
ff32e16e 2148 float_raise(float_flag_invalid, status);
fb3ea83a
TM
2149 return 0;
2150 }
2151
2152 if (absZ1) {
a2f2d288 2153 status->float_exception_flags |= float_flag_inexact;
fb3ea83a
TM
2154 }
2155 return absZ0;
2156}
2157
37d18660
PM
2158/*----------------------------------------------------------------------------
2159| If `a' is denormal and we are in flush-to-zero mode then set the
2160| input-denormal exception and return zero. Otherwise just return the value.
2161*----------------------------------------------------------------------------*/
e5a41ffa 2162float32 float32_squash_input_denormal(float32 a, float_status *status)
37d18660 2163{
a2f2d288 2164 if (status->flush_inputs_to_zero) {
37d18660 2165 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
ff32e16e 2166 float_raise(float_flag_input_denormal, status);
37d18660
PM
2167 return make_float32(float32_val(a) & 0x80000000);
2168 }
2169 }
2170 return a;
2171}
2172
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig': it is shifted
| left by (leading-zero count - 8) bits.  The shifted significand is stored
| through `zSigPtr' and the matching exponent, 1 minus the shift distance,
| through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
2190
158142c2
FB
2191/*----------------------------------------------------------------------------
2192| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2193| and significand `zSig', and returns the proper single-precision floating-
2194| point value corresponding to the abstract input. Ordinarily, the abstract
2195| value is simply rounded and packed into the single-precision format, with
2196| the inexact exception raised if the abstract input cannot be represented
2197| exactly. However, if the abstract value is too large, the overflow and
2198| inexact exceptions are raised and an infinity or maximal finite value is
2199| returned. If the abstract value is too small, the input value is rounded to
2200| a subnormal number, and the underflow and inexact exceptions are raised if
2201| the abstract input cannot be represented exactly as a subnormal single-
2202| precision floating-point number.
2203| The input significand `zSig' has its binary point between bits 30
2204| and 29, which is 7 bits to the left of the usual location. This shifted
2205| significand must be normalized or smaller. If `zSig' is not normalized,
2206| `zExp' must be 0; in that case, the result returned is a subnormal number,
2207| and it must not require rounding. In the usual case that `zSig' is
2208| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2209| The handling of underflow and overflow follows the IEC/IEEE Standard for
2210| Binary Floating-Point Arithmetic.
2211*----------------------------------------------------------------------------*/
2212
0c48262d 2213static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 2214 float_status *status)
158142c2 2215{
8f506c70 2216 int8_t roundingMode;
158142c2 2217 flag roundNearestEven;
8f506c70 2218 int8_t roundIncrement, roundBits;
158142c2
FB
2219 flag isTiny;
2220
a2f2d288 2221 roundingMode = status->float_rounding_mode;
158142c2 2222 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2223 switch (roundingMode) {
2224 case float_round_nearest_even:
f9288a76 2225 case float_round_ties_away:
dc355b76
PM
2226 roundIncrement = 0x40;
2227 break;
2228 case float_round_to_zero:
2229 roundIncrement = 0;
2230 break;
2231 case float_round_up:
2232 roundIncrement = zSign ? 0 : 0x7f;
2233 break;
2234 case float_round_down:
2235 roundIncrement = zSign ? 0x7f : 0;
2236 break;
2237 default:
2238 abort();
2239 break;
158142c2
FB
2240 }
2241 roundBits = zSig & 0x7F;
bb98fe42 2242 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
2243 if ( ( 0xFD < zExp )
2244 || ( ( zExp == 0xFD )
bb98fe42 2245 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 2246 ) {
ff32e16e 2247 float_raise(float_flag_overflow | float_flag_inexact, status);
f090c9d4 2248 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
2249 }
2250 if ( zExp < 0 ) {
a2f2d288 2251 if (status->flush_to_zero) {
ff32e16e 2252 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2253 return packFloat32(zSign, 0, 0);
2254 }
158142c2 2255 isTiny =
a2f2d288
PM
2256 (status->float_detect_tininess
2257 == float_tininess_before_rounding)
158142c2
FB
2258 || ( zExp < -1 )
2259 || ( zSig + roundIncrement < 0x80000000 );
2260 shift32RightJamming( zSig, - zExp, &zSig );
2261 zExp = 0;
2262 roundBits = zSig & 0x7F;
ff32e16e
PM
2263 if (isTiny && roundBits) {
2264 float_raise(float_flag_underflow, status);
2265 }
158142c2
FB
2266 }
2267 }
a2f2d288
PM
2268 if (roundBits) {
2269 status->float_exception_flags |= float_flag_inexact;
2270 }
158142c2
FB
2271 zSig = ( zSig + roundIncrement )>>7;
2272 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2273 if ( zSig == 0 ) zExp = 0;
2274 return packFloat32( zSign, zExp, zSig );
2275
2276}
2277
2278/*----------------------------------------------------------------------------
2279| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2280| and significand `zSig', and returns the proper single-precision floating-
2281| point value corresponding to the abstract input. This routine is just like
2282| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2283| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2284| floating-point exponent.
2285*----------------------------------------------------------------------------*/
2286
2287static float32
0c48262d 2288 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
e5a41ffa 2289 float_status *status)
158142c2 2290{
8f506c70 2291 int8_t shiftCount;
158142c2
FB
2292
2293 shiftCount = countLeadingZeros32( zSig ) - 1;
ff32e16e
PM
2294 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2295 status);
158142c2
FB
2296
2297}
2298
37d18660
PM
2299/*----------------------------------------------------------------------------
2300| If `a' is denormal and we are in flush-to-zero mode then set the
2301| input-denormal exception and return zero. Otherwise just return the value.
2302*----------------------------------------------------------------------------*/
e5a41ffa 2303float64 float64_squash_input_denormal(float64 a, float_status *status)
37d18660 2304{
a2f2d288 2305 if (status->flush_inputs_to_zero) {
37d18660 2306 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
ff32e16e 2307 float_raise(float_flag_input_denormal, status);
37d18660
PM
2308 return make_float64(float64_val(a) & (1ULL << 63));
2309 }
2310 }
2311 return a;
2312}
2313
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig': it is shifted
| left by (leading-zero count - 11) bits.  The shifted significand is stored
| through `zSigPtr' and the matching exponent, 1 minus the shift distance,
| through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
2331
2332/*----------------------------------------------------------------------------
2333| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2334| double-precision floating-point value, returning the result. After being
2335| shifted into the proper positions, the three fields are simply added
2336| together to form the result. This means that any integer portion of `zSig'
2337| will be added into the exponent. Since a properly normalized significand
2338| will have an integer portion equal to 1, the `zExp' input should be 1 less
2339| than the desired result exponent whenever `zSig' is a complete, normalized
2340| significand.
2341*----------------------------------------------------------------------------*/
2342
0c48262d 2343static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
158142c2
FB
2344{
2345
f090c9d4 2346 return make_float64(
bb98fe42 2347 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
2348
2349}
2350
2351/*----------------------------------------------------------------------------
2352| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2353| and significand `zSig', and returns the proper double-precision floating-
2354| point value corresponding to the abstract input. Ordinarily, the abstract
2355| value is simply rounded and packed into the double-precision format, with
2356| the inexact exception raised if the abstract input cannot be represented
2357| exactly. However, if the abstract value is too large, the overflow and
2358| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
2359| returned. If the abstract value is too small, the input value is rounded to
2360| a subnormal number, and the underflow and inexact exceptions are raised if
2361| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
2362| precision floating-point number.
2363| The input significand `zSig' has its binary point between bits 62
2364| and 61, which is 10 bits to the left of the usual location. This shifted
2365| significand must be normalized or smaller. If `zSig' is not normalized,
2366| `zExp' must be 0; in that case, the result returned is a subnormal number,
2367| and it must not require rounding. In the usual case that `zSig' is
2368| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2369| The handling of underflow and overflow follows the IEC/IEEE Standard for
2370| Binary Floating-Point Arithmetic.
2371*----------------------------------------------------------------------------*/
2372
0c48262d 2373static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 2374 float_status *status)
158142c2 2375{
8f506c70 2376 int8_t roundingMode;
158142c2 2377 flag roundNearestEven;
0c48262d 2378 int roundIncrement, roundBits;
158142c2
FB
2379 flag isTiny;
2380
a2f2d288 2381 roundingMode = status->float_rounding_mode;
158142c2 2382 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2383 switch (roundingMode) {
2384 case float_round_nearest_even:
f9288a76 2385 case float_round_ties_away:
dc355b76
PM
2386 roundIncrement = 0x200;
2387 break;
2388 case float_round_to_zero:
2389 roundIncrement = 0;
2390 break;
2391 case float_round_up:
2392 roundIncrement = zSign ? 0 : 0x3ff;
2393 break;
2394 case float_round_down:
2395 roundIncrement = zSign ? 0x3ff : 0;
2396 break;
9ee6f678
BR
2397 case float_round_to_odd:
2398 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2399 break;
dc355b76
PM
2400 default:
2401 abort();
158142c2
FB
2402 }
2403 roundBits = zSig & 0x3FF;
bb98fe42 2404 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
2405 if ( ( 0x7FD < zExp )
2406 || ( ( zExp == 0x7FD )
bb98fe42 2407 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2 2408 ) {
9ee6f678
BR
2409 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2410 roundIncrement != 0;
ff32e16e 2411 float_raise(float_flag_overflow | float_flag_inexact, status);
9ee6f678 2412 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
158142c2
FB
2413 }
2414 if ( zExp < 0 ) {
a2f2d288 2415 if (status->flush_to_zero) {
ff32e16e 2416 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2417 return packFloat64(zSign, 0, 0);
2418 }
158142c2 2419 isTiny =
a2f2d288
PM
2420 (status->float_detect_tininess
2421 == float_tininess_before_rounding)
158142c2
FB
2422 || ( zExp < -1 )
2423 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2424 shift64RightJamming( zSig, - zExp, &zSig );
2425 zExp = 0;
2426 roundBits = zSig & 0x3FF;
ff32e16e
PM
2427 if (isTiny && roundBits) {
2428 float_raise(float_flag_underflow, status);
2429 }
9ee6f678
BR
2430 if (roundingMode == float_round_to_odd) {
2431 /*
2432 * For round-to-odd case, the roundIncrement depends on
2433 * zSig which just changed.
2434 */
2435 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2436 }
158142c2
FB
2437 }
2438 }
a2f2d288
PM
2439 if (roundBits) {
2440 status->float_exception_flags |= float_flag_inexact;
2441 }
158142c2
FB
2442 zSig = ( zSig + roundIncrement )>>10;
2443 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2444 if ( zSig == 0 ) zExp = 0;
2445 return packFloat64( zSign, zExp, zSig );
2446
2447}
2448
2449/*----------------------------------------------------------------------------
2450| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2451| and significand `zSig', and returns the proper double-precision floating-
2452| point value corresponding to the abstract input. This routine is just like
2453| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2454| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2455| floating-point exponent.
2456*----------------------------------------------------------------------------*/
2457
2458static float64
0c48262d 2459 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
e5a41ffa 2460 float_status *status)
158142c2 2461{
8f506c70 2462 int8_t shiftCount;
158142c2
FB
2463
2464 shiftCount = countLeadingZeros64( zSig ) - 1;
ff32e16e
PM
2465 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2466 status);
158142c2
FB
2467
2468}
2469
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision significand `aSig': it
| is shifted left by its leading-zero count.  The shifted significand is
| stored through `zSigPtr' and the matching exponent, 1 minus the shift
| distance, through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
2486
2487/*----------------------------------------------------------------------------
2488| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2489| and extended significand formed by the concatenation of `zSig0' and `zSig1',
2490| and returns the proper extended double-precision floating-point value
2491| corresponding to the abstract input. Ordinarily, the abstract value is
2492| rounded and packed into the extended double-precision format, with the
2493| inexact exception raised if the abstract input cannot be represented
2494| exactly. However, if the abstract value is too large, the overflow and
2495| inexact exceptions are raised and an infinity or maximal finite value is
2496| returned. If the abstract value is too small, the input value is rounded to
2497| a subnormal number, and the underflow and inexact exceptions are raised if
2498| the abstract input cannot be represented exactly as a subnormal extended
2499| double-precision floating-point number.
2500| If `roundingPrecision' is 32 or 64, the result is rounded to the same
2501| number of bits as single or double precision, respectively. Otherwise, the
2502| result is rounded to the full precision of the extended double-precision
2503| format.
2504| The input significand must be normalized or smaller. If the input
2505| significand is not normalized, `zExp' must be 0; in that case, the result
2506| returned is a subnormal number, and it must not require rounding. The
2507| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2508| Floating-Point Arithmetic.
2509*----------------------------------------------------------------------------*/
2510
88857aca
LV
2511floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2512 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2513 float_status *status)
158142c2 2514{
8f506c70 2515 int8_t roundingMode;
158142c2 2516 flag roundNearestEven, increment, isTiny;
f42c2224 2517 int64_t roundIncrement, roundMask, roundBits;
158142c2 2518
a2f2d288 2519 roundingMode = status->float_rounding_mode;
158142c2
FB
2520 roundNearestEven = ( roundingMode == float_round_nearest_even );
2521 if ( roundingPrecision == 80 ) goto precision80;
2522 if ( roundingPrecision == 64 ) {
2523 roundIncrement = LIT64( 0x0000000000000400 );
2524 roundMask = LIT64( 0x00000000000007FF );
2525 }
2526 else if ( roundingPrecision == 32 ) {
2527 roundIncrement = LIT64( 0x0000008000000000 );
2528 roundMask = LIT64( 0x000000FFFFFFFFFF );
2529 }
2530 else {
2531 goto precision80;
2532 }
2533 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
2534 switch (roundingMode) {
2535 case float_round_nearest_even:
f9288a76 2536 case float_round_ties_away:
dc355b76
PM
2537 break;
2538 case float_round_to_zero:
2539 roundIncrement = 0;
2540 break;
2541 case float_round_up:
2542 roundIncrement = zSign ? 0 : roundMask;
2543 break;
2544 case float_round_down:
2545 roundIncrement = zSign ? roundMask : 0;
2546 break;
2547 default:
2548 abort();
158142c2
FB
2549 }
2550 roundBits = zSig0 & roundMask;
bb98fe42 2551 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
2552 if ( ( 0x7FFE < zExp )
2553 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2554 ) {
2555 goto overflow;
2556 }
2557 if ( zExp <= 0 ) {
a2f2d288 2558 if (status->flush_to_zero) {
ff32e16e 2559 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2560 return packFloatx80(zSign, 0, 0);
2561 }
158142c2 2562 isTiny =
a2f2d288
PM
2563 (status->float_detect_tininess
2564 == float_tininess_before_rounding)
158142c2
FB
2565 || ( zExp < 0 )
2566 || ( zSig0 <= zSig0 + roundIncrement );
2567 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2568 zExp = 0;
2569 roundBits = zSig0 & roundMask;
ff32e16e
PM
2570 if (isTiny && roundBits) {
2571 float_raise(float_flag_underflow, status);
2572 }
a2f2d288
PM
2573 if (roundBits) {
2574 status->float_exception_flags |= float_flag_inexact;
2575 }
158142c2 2576 zSig0 += roundIncrement;
bb98fe42 2577 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
2578 roundIncrement = roundMask + 1;
2579 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2580 roundMask |= roundIncrement;
2581 }
2582 zSig0 &= ~ roundMask;
2583 return packFloatx80( zSign, zExp, zSig0 );
2584 }
2585 }
a2f2d288
PM
2586 if (roundBits) {
2587 status->float_exception_flags |= float_flag_inexact;
2588 }
158142c2
FB
2589 zSig0 += roundIncrement;
2590 if ( zSig0 < roundIncrement ) {
2591 ++zExp;
2592 zSig0 = LIT64( 0x8000000000000000 );
2593 }
2594 roundIncrement = roundMask + 1;
2595 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2596 roundMask |= roundIncrement;
2597 }
2598 zSig0 &= ~ roundMask;
2599 if ( zSig0 == 0 ) zExp = 0;
2600 return packFloatx80( zSign, zExp, zSig0 );
2601 precision80:
dc355b76
PM
2602 switch (roundingMode) {
2603 case float_round_nearest_even:
f9288a76 2604 case float_round_ties_away:
dc355b76
PM
2605 increment = ((int64_t)zSig1 < 0);
2606 break;
2607 case float_round_to_zero:
2608 increment = 0;
2609 break;
2610 case float_round_up:
2611 increment = !zSign && zSig1;
2612 break;
2613 case float_round_down:
2614 increment = zSign && zSig1;
2615 break;
2616 default:
2617 abort();
158142c2 2618 }
bb98fe42 2619 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
2620 if ( ( 0x7FFE < zExp )
2621 || ( ( zExp == 0x7FFE )
2622 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2623 && increment
2624 )
2625 ) {
2626 roundMask = 0;
2627 overflow:
ff32e16e 2628 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
2629 if ( ( roundingMode == float_round_to_zero )
2630 || ( zSign && ( roundingMode == float_round_up ) )
2631 || ( ! zSign && ( roundingMode == float_round_down ) )
2632 ) {
2633 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2634 }
0f605c88
LV
2635 return packFloatx80(zSign,
2636 floatx80_infinity_high,
2637 floatx80_infinity_low);
158142c2
FB
2638 }
2639 if ( zExp <= 0 ) {
2640 isTiny =
a2f2d288
PM
2641 (status->float_detect_tininess
2642 == float_tininess_before_rounding)
158142c2
FB
2643 || ( zExp < 0 )
2644 || ! increment
2645 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2646 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2647 zExp = 0;
ff32e16e
PM
2648 if (isTiny && zSig1) {
2649 float_raise(float_flag_underflow, status);
2650 }
a2f2d288
PM
2651 if (zSig1) {
2652 status->float_exception_flags |= float_flag_inexact;
2653 }
dc355b76
PM
2654 switch (roundingMode) {
2655 case float_round_nearest_even:
f9288a76 2656 case float_round_ties_away:
dc355b76
PM
2657 increment = ((int64_t)zSig1 < 0);
2658 break;
2659 case float_round_to_zero:
2660 increment = 0;
2661 break;
2662 case float_round_up:
2663 increment = !zSign && zSig1;
2664 break;
2665 case float_round_down:
2666 increment = zSign && zSig1;
2667 break;
2668 default:
2669 abort();
158142c2
FB
2670 }
2671 if ( increment ) {
2672 ++zSig0;
2673 zSig0 &=
bb98fe42
AF
2674 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2675 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
2676 }
2677 return packFloatx80( zSign, zExp, zSig0 );
2678 }
2679 }
a2f2d288
PM
2680 if (zSig1) {
2681 status->float_exception_flags |= float_flag_inexact;
2682 }
158142c2
FB
2683 if ( increment ) {
2684 ++zSig0;
2685 if ( zSig0 == 0 ) {
2686 ++zExp;
2687 zSig0 = LIT64( 0x8000000000000000 );
2688 }
2689 else {
bb98fe42 2690 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
2691 }
2692 }
2693 else {
2694 if ( zSig0 == 0 ) zExp = 0;
2695 }
2696 return packFloatx80( zSign, zExp, zSig0 );
2697
2698}
2699
2700/*----------------------------------------------------------------------------
2701| Takes an abstract floating-point value having sign `zSign', exponent
2702| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2703| and returns the proper extended double-precision floating-point value
2704| corresponding to the abstract input. This routine is just like
2705| `roundAndPackFloatx80' except that the input significand does not have to be
2706| normalized.
2707*----------------------------------------------------------------------------*/
2708
88857aca
LV
2709floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2710 flag zSign, int32_t zExp,
2711 uint64_t zSig0, uint64_t zSig1,
2712 float_status *status)
158142c2 2713{
8f506c70 2714 int8_t shiftCount;
158142c2
FB
2715
2716 if ( zSig0 == 0 ) {
2717 zSig0 = zSig1;
2718 zSig1 = 0;
2719 zExp -= 64;
2720 }
2721 shiftCount = countLeadingZeros64( zSig0 );
2722 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2723 zExp -= shiftCount;
ff32e16e
PM
2724 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2725 zSig0, zSig1, status);
158142c2
FB
2726
2727}
2728
158142c2
FB
2729/*----------------------------------------------------------------------------
2730| Returns the least-significant 64 fraction bits of the quadruple-precision
2731| floating-point value `a'.
2732*----------------------------------------------------------------------------*/
2733
a49db98d 2734static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
2735{
2736
2737 return a.low;
2738
2739}
2740
2741/*----------------------------------------------------------------------------
2742| Returns the most-significant 48 fraction bits of the quadruple-precision
2743| floating-point value `a'.
2744*----------------------------------------------------------------------------*/
2745
a49db98d 2746static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
2747{
2748
2749 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2750
2751}
2752
2753/*----------------------------------------------------------------------------
2754| Returns the exponent bits of the quadruple-precision floating-point value
2755| `a'.
2756*----------------------------------------------------------------------------*/
2757
f4014512 2758static inline int32_t extractFloat128Exp( float128 a )
158142c2
FB
2759{
2760
2761 return ( a.high>>48 ) & 0x7FFF;
2762
2763}
2764
2765/*----------------------------------------------------------------------------
2766| Returns the sign bit of the quadruple-precision floating-point value `a'.
2767*----------------------------------------------------------------------------*/
2768
a49db98d 2769static inline flag extractFloat128Sign( float128 a )
158142c2
FB
2770{
2771
2772 return a.high>>63;
2773
2774}
2775
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by
| `aSig0':`aSig1'.  The pair is shifted so that its leading one sits 15 bits
| below the top of the high word.  The resulting high and low words are
| stored through `zSig0Ptr' and `zSig1Ptr', and the matching exponent through
| `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t dist;

    if (aSig0 != 0) {
        /* Leading one is in the high word: an ordinary 128-bit left shift. */
        dist = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, dist, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - dist;
        return;
    }

    /* High word empty: the low word supplies the whole significand. */
    dist = countLeadingZeros64(aSig1) - 15;
    if (dist < 0) {
        /* Fewer than 15 leading zeros: the value straddles both words. */
        *zSig0Ptr = aSig1 >> (-dist);
        *zSig1Ptr = aSig1 << (dist & 63);
    } else {
        *zSig0Ptr = aSig1 << dist;
        *zSig1Ptr = 0;
    }
    *zExpPtr = -dist - 63;
}
2816
2817/*----------------------------------------------------------------------------
2818| Packs the sign `zSign', the exponent `zExp', and the significand formed
2819| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2820| floating-point value, returning the result. After being shifted into the
2821| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2822| added together to form the most significant 32 bits of the result. This
2823| means that any integer portion of `zSig0' will be added into the exponent.
2824| Since a properly normalized significand will have an integer portion equal
2825| to 1, the `zExp' input should be 1 less than the desired result exponent
2826| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2827| significand.
2828*----------------------------------------------------------------------------*/
2829
a49db98d 2830static inline float128
f4014512 2831 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
2832{
2833 float128 z;
2834
2835 z.low = zSig1;
bb98fe42 2836 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
2837 return z;
2838
2839}
2840
2841/*----------------------------------------------------------------------------
2842| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2843| and extended significand formed by the concatenation of `zSig0', `zSig1',
2844| and `zSig2', and returns the proper quadruple-precision floating-point value
2845| corresponding to the abstract input. Ordinarily, the abstract value is
2846| simply rounded and packed into the quadruple-precision format, with the
2847| inexact exception raised if the abstract input cannot be represented
2848| exactly. However, if the abstract value is too large, the overflow and
2849| inexact exceptions are raised and an infinity or maximal finite value is
2850| returned. If the abstract value is too small, the input value is rounded to
2851| a subnormal number, and the underflow and inexact exceptions are raised if
2852| the abstract input cannot be represented exactly as a subnormal quadruple-
2853| precision floating-point number.
2854| The input significand must be normalized or smaller. If the input
2855| significand is not normalized, `zExp' must be 0; in that case, the result
2856| returned is a subnormal number, and it must not require rounding. In the
2857| usual case that the input significand is normalized, `zExp' must be 1 less
2858| than the ``true'' floating-point exponent. The handling of underflow and
2859| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2860*----------------------------------------------------------------------------*/
2861
f4014512 2862static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
2863 uint64_t zSig0, uint64_t zSig1,
2864 uint64_t zSig2, float_status *status)
158142c2 2865{
8f506c70 2866 int8_t roundingMode;
158142c2
FB
2867 flag roundNearestEven, increment, isTiny;
2868
a2f2d288 2869 roundingMode = status->float_rounding_mode;
158142c2 2870 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
2871 switch (roundingMode) {
2872 case float_round_nearest_even:
f9288a76 2873 case float_round_ties_away:
dc355b76
PM
2874 increment = ((int64_t)zSig2 < 0);
2875 break;
2876 case float_round_to_zero:
2877 increment = 0;
2878 break;
2879 case float_round_up:
2880 increment = !zSign && zSig2;
2881 break;
2882 case float_round_down:
2883 increment = zSign && zSig2;
2884 break;
9ee6f678
BR
2885 case float_round_to_odd:
2886 increment = !(zSig1 & 0x1) && zSig2;
2887 break;
dc355b76
PM
2888 default:
2889 abort();
158142c2 2890 }
bb98fe42 2891 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
2892 if ( ( 0x7FFD < zExp )
2893 || ( ( zExp == 0x7FFD )
2894 && eq128(
2895 LIT64( 0x0001FFFFFFFFFFFF ),
2896 LIT64( 0xFFFFFFFFFFFFFFFF ),
2897 zSig0,
2898 zSig1
2899 )
2900 && increment
2901 )
2902 ) {
ff32e16e 2903 float_raise(float_flag_overflow | float_flag_inexact, status);
158142c2
FB
2904 if ( ( roundingMode == float_round_to_zero )
2905 || ( zSign && ( roundingMode == float_round_up ) )
2906 || ( ! zSign && ( roundingMode == float_round_down ) )
9ee6f678 2907 || (roundingMode == float_round_to_odd)
158142c2
FB
2908 ) {
2909 return
2910 packFloat128(
2911 zSign,
2912 0x7FFE,
2913 LIT64( 0x0000FFFFFFFFFFFF ),
2914 LIT64( 0xFFFFFFFFFFFFFFFF )
2915 );
2916 }
2917 return packFloat128( zSign, 0x7FFF, 0, 0 );
2918 }
2919 if ( zExp < 0 ) {
a2f2d288 2920 if (status->flush_to_zero) {
ff32e16e 2921 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
2922 return packFloat128(zSign, 0, 0, 0);
2923 }
158142c2 2924 isTiny =
a2f2d288
PM
2925 (status->float_detect_tininess
2926 == float_tininess_before_rounding)
158142c2
FB
2927 || ( zExp < -1 )
2928 || ! increment
2929 || lt128(
2930 zSig0,
2931 zSig1,
2932 LIT64( 0x0001FFFFFFFFFFFF ),
2933 LIT64( 0xFFFFFFFFFFFFFFFF )
2934 );
2935 shift128ExtraRightJamming(
2936 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2937 zExp = 0;
ff32e16e
PM
2938 if (isTiny && zSig2) {
2939 float_raise(float_flag_underflow, status);
2940 }
dc355b76
PM
2941 switch (roundingMode) {
2942 case float_round_nearest_even:
f9288a76 2943 case float_round_ties_away:
dc355b76
PM
2944 increment = ((int64_t)zSig2 < 0);
2945 break;
2946 case float_round_to_zero:
2947 increment = 0;
2948 break;
2949 case float_round_up:
2950 increment = !zSign && zSig2;
2951 break;
2952 case float_round_down:
2953 increment = zSign && zSig2;
2954 break;
9ee6f678
BR
2955 case float_round_to_odd:
2956 increment = !(zSig1 & 0x1) && zSig2;
2957 break;
dc355b76
PM
2958 default:
2959 abort();
158142c2
FB
2960 }
2961 }
2962 }
a2f2d288
PM
2963 if (zSig2) {
2964 status->float_exception_flags |= float_flag_inexact;
2965 }
158142c2
FB
2966 if ( increment ) {
2967 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2968 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2969 }
2970 else {
2971 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2972 }
2973 return packFloat128( zSign, zExp, zSig0, zSig1 );
2974
2975}
2976
2977/*----------------------------------------------------------------------------
2978| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2979| and significand formed by the concatenation of `zSig0' and `zSig1', and
2980| returns the proper quadruple-precision floating-point value corresponding
2981| to the abstract input. This routine is just like `roundAndPackFloat128'
2982| except that the input significand has fewer bits and does not have to be
2983| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2984| point exponent.
2985*----------------------------------------------------------------------------*/
2986
f4014512 2987static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
e5a41ffa
PM
2988 uint64_t zSig0, uint64_t zSig1,
2989 float_status *status)
158142c2 2990{
8f506c70 2991 int8_t shiftCount;
bb98fe42 2992 uint64_t zSig2;
158142c2
FB
2993
2994 if ( zSig0 == 0 ) {
2995 zSig0 = zSig1;
2996 zSig1 = 0;
2997 zExp -= 64;
2998 }
2999 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3000 if ( 0 <= shiftCount ) {
3001 zSig2 = 0;
3002 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3003 }
3004 else {
3005 shift128ExtraRightJamming(
3006 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3007 }
3008 zExp -= shiftCount;
ff32e16e 3009 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
3010
3011}
3012
158142c2 3013
158142c2
FB
3014/*----------------------------------------------------------------------------
3015| Returns the result of converting the 32-bit two's complement integer `a'
3016| to the extended double-precision floating-point format. The conversion
3017| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3018| Arithmetic.
3019*----------------------------------------------------------------------------*/
3020
e5a41ffa 3021floatx80 int32_to_floatx80(int32_t a, float_status *status)
158142c2
FB
3022{
3023 flag zSign;
3a87d009 3024 uint32_t absA;
8f506c70 3025 int8_t shiftCount;
bb98fe42 3026 uint64_t zSig;
158142c2
FB
3027
3028 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3029 zSign = ( a < 0 );
3030 absA = zSign ? - a : a;
3031 shiftCount = countLeadingZeros32( absA ) + 32;
3032 zSig = absA;
3033 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3034
3035}
3036
158142c2
FB
3037/*----------------------------------------------------------------------------
3038| Returns the result of converting the 32-bit two's complement integer `a' to
3039| the quadruple-precision floating-point format. The conversion is performed
3040| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3041*----------------------------------------------------------------------------*/
3042
e5a41ffa 3043float128 int32_to_float128(int32_t a, float_status *status)
158142c2
FB
3044{
3045 flag zSign;
3a87d009 3046 uint32_t absA;
8f506c70 3047 int8_t shiftCount;
bb98fe42 3048 uint64_t zSig0;
158142c2
FB
3049
3050 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3051 zSign = ( a < 0 );
3052 absA = zSign ? - a : a;
3053 shiftCount = countLeadingZeros32( absA ) + 17;
3054 zSig0 = absA;
3055 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3056
3057}
3058
158142c2
FB
3059/*----------------------------------------------------------------------------
3060| Returns the result of converting the 64-bit two's complement integer `a'
3061| to the extended double-precision floating-point format. The conversion
3062| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3063| Arithmetic.
3064*----------------------------------------------------------------------------*/
3065
e5a41ffa 3066floatx80 int64_to_floatx80(int64_t a, float_status *status)
158142c2
FB
3067{
3068 flag zSign;
182f42fd 3069 uint64_t absA;
8f506c70 3070 int8_t shiftCount;
158142c2
FB
3071
3072 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3073 zSign = ( a < 0 );
3074 absA = zSign ? - a : a;
3075 shiftCount = countLeadingZeros64( absA );
3076 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3077
3078}
3079
158142c2
FB
3080/*----------------------------------------------------------------------------
3081| Returns the result of converting the 64-bit two's complement integer `a' to
3082| the quadruple-precision floating-point format. The conversion is performed
3083| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3084*----------------------------------------------------------------------------*/
3085
e5a41ffa 3086float128 int64_to_float128(int64_t a, float_status *status)
158142c2
FB
3087{
3088 flag zSign;
182f42fd 3089 uint64_t absA;
8f506c70 3090 int8_t shiftCount;
f4014512 3091 int32_t zExp;
bb98fe42 3092 uint64_t zSig0, zSig1;
158142c2
FB
3093
3094 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3095 zSign = ( a < 0 );
3096 absA = zSign ? - a : a;
3097 shiftCount = countLeadingZeros64( absA ) + 49;
3098 zExp = 0x406E - shiftCount;
3099 if ( 64 <= shiftCount ) {
3100 zSig1 = 0;
3101 zSig0 = absA;
3102 shiftCount -= 64;
3103 }
3104 else {
3105 zSig1 = absA;
3106 zSig0 = 0;
3107 }
3108 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3109 return packFloat128( zSign, zExp, zSig0, zSig1 );
3110
3111}
3112
6bb8e0f1
PM
3113/*----------------------------------------------------------------------------
3114| Returns the result of converting the 64-bit unsigned integer `a'
3115| to the quadruple-precision floating-point format. The conversion is performed
3116| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3117*----------------------------------------------------------------------------*/
3118
e5a41ffa 3119float128 uint64_to_float128(uint64_t a, float_status *status)
1e397ead
RH
3120{
3121 if (a == 0) {
3122 return float128_zero;
3123 }
6603d506 3124 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
1e397ead
RH
3125}
3126
158142c2 3127
158142c2 3128
158142c2
FB
3129
3130/*----------------------------------------------------------------------------
3131| Returns the result of converting the single-precision floating-point value
3132| `a' to the double-precision floating-point format. The conversion is
3133| performed according to the IEC/IEEE Standard for Binary Floating-Point
3134| Arithmetic.
3135*----------------------------------------------------------------------------*/
3136
e5a41ffa 3137float64 float32_to_float64(float32 a, float_status *status)
158142c2
FB
3138{
3139 flag aSign;
0c48262d 3140 int aExp;
bb98fe42 3141 uint32_t aSig;
ff32e16e 3142 a = float32_squash_input_denormal(a, status);
158142c2
FB
3143
3144 aSig = extractFloat32Frac( a );
3145 aExp = extractFloat32Exp( a );
3146 aSign = extractFloat32Sign( a );
3147 if ( aExp == 0xFF ) {
ff32e16e
PM
3148 if (aSig) {
3149 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3150 }
158142c2
FB
3151 return packFloat64( aSign, 0x7FF, 0 );
3152 }
3153 if ( aExp == 0 ) {
3154 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3155 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3156 --aExp;
3157 }
bb98fe42 3158 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
3159
3160}
3161
158142c2
FB
3162/*----------------------------------------------------------------------------
3163| Returns the result of converting the single-precision floating-point value
3164| `a' to the extended double-precision floating-point format. The conversion
3165| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3166| Arithmetic.
3167*----------------------------------------------------------------------------*/
3168
e5a41ffa 3169floatx80 float32_to_floatx80(float32 a, float_status *status)
158142c2
FB
3170{
3171 flag aSign;
0c48262d 3172 int aExp;
bb98fe42 3173 uint32_t aSig;
158142c2 3174
ff32e16e 3175 a = float32_squash_input_denormal(a, status);
158142c2
FB
3176 aSig = extractFloat32Frac( a );
3177 aExp = extractFloat32Exp( a );
3178 aSign = extractFloat32Sign( a );
3179 if ( aExp == 0xFF ) {
ff32e16e
PM
3180 if (aSig) {
3181 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3182 }
0f605c88
LV
3183 return packFloatx80(aSign,
3184 floatx80_infinity_high,
3185 floatx80_infinity_low);
158142c2
FB
3186 }
3187 if ( aExp == 0 ) {
3188 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3189 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3190 }
3191 aSig |= 0x00800000;
bb98fe42 3192 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
3193
3194}
3195
158142c2
FB
3196/*----------------------------------------------------------------------------
3197| Returns the result of converting the single-precision floating-point value
3198| `a' to the quadruple-precision floating-point format. The conversion is
3199| performed according to the IEC/IEEE Standard for Binary Floating-Point
3200| Arithmetic.
3201*----------------------------------------------------------------------------*/
3202
e5a41ffa 3203float128 float32_to_float128(float32 a, float_status *status)
158142c2
FB
3204{
3205 flag aSign;
0c48262d 3206 int aExp;
bb98fe42 3207 uint32_t aSig;
158142c2 3208
ff32e16e 3209 a = float32_squash_input_denormal(a, status);
158142c2
FB
3210 aSig = extractFloat32Frac( a );
3211 aExp = extractFloat32Exp( a );
3212 aSign = extractFloat32Sign( a );
3213 if ( aExp == 0xFF ) {
ff32e16e
PM
3214 if (aSig) {
3215 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3216 }
158142c2
FB
3217 return packFloat128( aSign, 0x7FFF, 0, 0 );
3218 }
3219 if ( aExp == 0 ) {
3220 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3221 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3222 --aExp;
3223 }
bb98fe42 3224 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
3225
3226}
3227
158142c2
FB
3228/*----------------------------------------------------------------------------
3229| Returns the remainder of the single-precision floating-point value `a'
3230| with respect to the corresponding value `b'. The operation is performed
3231| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3232*----------------------------------------------------------------------------*/
3233
e5a41ffa 3234float32 float32_rem(float32 a, float32 b, float_status *status)
158142c2 3235{
ed086f3d 3236 flag aSign, zSign;
0c48262d 3237 int aExp, bExp, expDiff;
bb98fe42
AF
3238 uint32_t aSig, bSig;
3239 uint32_t q;
3240 uint64_t aSig64, bSig64, q64;
3241 uint32_t alternateASig;
3242 int32_t sigMean;
ff32e16e
PM
3243 a = float32_squash_input_denormal(a, status);
3244 b = float32_squash_input_denormal(b, status);
158142c2
FB
3245
3246 aSig = extractFloat32Frac( a );
3247 aExp = extractFloat32Exp( a );
3248 aSign = extractFloat32Sign( a );
3249 bSig = extractFloat32Frac( b );
3250 bExp = extractFloat32Exp( b );
158142c2
FB
3251 if ( aExp == 0xFF ) {
3252 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
ff32e16e 3253 return propagateFloat32NaN(a, b, status);
158142c2 3254 }
ff32e16e 3255 float_raise(float_flag_invalid, status);
af39bc8c 3256 return float32_default_nan(status);
158142c2
FB
3257 }
3258 if ( bExp == 0xFF ) {
ff32e16e
PM
3259 if (bSig) {
3260 return propagateFloat32NaN(a, b, status);
3261 }
158142c2
FB
3262 return a;
3263 }
3264 if ( bExp == 0 ) {
3265 if ( bSig == 0 ) {
ff32e16e 3266 float_raise(float_flag_invalid, status);
af39bc8c 3267 return float32_default_nan(status);
158142c2
FB
3268 }
3269 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3270 }
3271 if ( aExp == 0 ) {
3272 if ( aSig == 0 ) return a;
3273 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3274 }
3275 expDiff = aExp - bExp;
3276 aSig |= 0x00800000;
3277 bSig |= 0x00800000;
3278 if ( expDiff < 32 ) {
3279 aSig <<= 8;
3280 bSig <<= 8;
3281 if ( expDiff < 0 ) {
3282 if ( expDiff < -1 ) return a;
3283 aSig >>= 1;
3284 }
3285 q = ( bSig <= aSig );
3286 if ( q ) aSig -= bSig;
3287 if ( 0 < expDiff ) {
bb98fe42 3288 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
3289 q >>= 32 - expDiff;
3290 bSig >>= 2;
3291 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3292 }
3293 else {
3294 aSig >>= 2;
3295 bSig >>= 2;
3296 }
3297 }
3298 else {
3299 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
3300 aSig64 = ( (uint64_t) aSig )<<40;
3301 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
3302 expDiff -= 64;
3303 while ( 0 < expDiff ) {
3304 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3305 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3306 aSig64 = - ( ( bSig * q64 )<<38 );
3307 expDiff -= 62;
3308 }
3309 expDiff += 64;
3310 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3311 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3312 q = q64>>( 64 - expDiff );
3313 bSig <<= 6;
3314 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3315 }
3316 do {
3317 alternateASig = aSig;
3318 ++q;
3319 aSig -= bSig;
bb98fe42 3320 } while ( 0 <= (int32_t) aSig );
158142c2
FB
3321 sigMean = aSig + alternateASig;
3322 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3323 aSig = alternateASig;
3324 }
bb98fe42 3325 zSign = ( (int32_t) aSig < 0 );
158142c2 3326 if ( zSign ) aSig = - aSig;
ff32e16e 3327 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
3328}
3329
369be8f6 3330
158142c2 3331
8229c991
AJ
3332/*----------------------------------------------------------------------------
3333| Returns the binary exponential of the single-precision floating-point value
3334| `a'. The operation is performed according to the IEC/IEEE Standard for
3335| Binary Floating-Point Arithmetic.
3336|
3337| Uses the following identities:
3338|
3339| 1. 2^x = e^(x * ln(2))
3342|
3343| 2. e^x = 1 + x/1! + x^2/2! + x^3/3! + x^4/4! + x^5/5! + ... + x^n/n! + ...
3348*----------------------------------------------------------------------------*/
3349
3350static const float64 float32_exp2_coefficients[15] =
3351{
d5138cf4
PM
3352 const_float64( 0x3ff0000000000000ll ), /* 1 */
3353 const_float64( 0x3fe0000000000000ll ), /* 2 */
3354 const_float64( 0x3fc5555555555555ll ), /* 3 */
3355 const_float64( 0x3fa5555555555555ll ), /* 4 */
3356 const_float64( 0x3f81111111111111ll ), /* 5 */
3357 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3358 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3359 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3360 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3361 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3362 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3363 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3364 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3365 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3366 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
3367};
3368
e5a41ffa 3369float32 float32_exp2(float32 a, float_status *status)
8229c991
AJ
3370{
3371 flag aSign;
0c48262d 3372 int aExp;
bb98fe42 3373 uint32_t aSig;
8229c991
AJ
3374 float64 r, x, xn;
3375 int i;
ff32e16e 3376 a = float32_squash_input_denormal(a, status);
8229c991
AJ
3377
3378 aSig = extractFloat32Frac( a );
3379 aExp = extractFloat32Exp( a );
3380 aSign = extractFloat32Sign( a );
3381
3382 if ( aExp == 0xFF) {
ff32e16e
PM
3383 if (aSig) {
3384 return propagateFloat32NaN(a, float32_zero, status);
3385 }
8229c991
AJ
3386 return (aSign) ? float32_zero : a;
3387 }
3388 if (aExp == 0) {
3389 if (aSig == 0) return float32_one;
3390 }
3391
ff32e16e 3392 float_raise(float_flag_inexact, status);
8229c991
AJ
3393
3394 /* ******************************* */
3395 /* using float64 for approximation */
3396 /* ******************************* */
ff32e16e
PM
3397 x = float32_to_float64(a, status);
3398 x = float64_mul(x, float64_ln2, status);
8229c991
AJ
3399
3400 xn = x;
3401 r = float64_one;
3402 for (i = 0 ; i < 15 ; i++) {
3403 float64 f;
3404
ff32e16e
PM
3405 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3406 r = float64_add(r, f, status);
8229c991 3407
ff32e16e 3408 xn = float64_mul(xn, x, status);
8229c991
AJ
3409 }
3410
3411 return float64_to_float32(r, status);
3412}
3413
374dfc33
AJ
3414/*----------------------------------------------------------------------------
3415| Returns the binary log of the single-precision floating-point value `a'.
3416| The operation is performed according to the IEC/IEEE Standard for Binary
3417| Floating-Point Arithmetic.
3418*----------------------------------------------------------------------------*/
e5a41ffa 3419float32 float32_log2(float32 a, float_status *status)
374dfc33
AJ
3420{
3421 flag aSign, zSign;
0c48262d 3422 int aExp;
bb98fe42 3423 uint32_t aSig, zSig, i;
374dfc33 3424
ff32e16e 3425 a = float32_squash_input_denormal(a, status);
374dfc33
AJ
3426 aSig = extractFloat32Frac( a );
3427 aExp = extractFloat32Exp( a );
3428 aSign = extractFloat32Sign( a );
3429
3430 if ( aExp == 0 ) {
3431 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3432 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3433 }
3434 if ( aSign ) {
ff32e16e 3435 float_raise(float_flag_invalid, status);
af39bc8c 3436 return float32_default_nan(status);
374dfc33
AJ
3437 }
3438 if ( aExp == 0xFF ) {
ff32e16e
PM
3439 if (aSig) {
3440 return propagateFloat32NaN(a, float32_zero, status);
3441 }
374dfc33
AJ
3442 return a;
3443 }
3444
3445 aExp -= 0x7F;
3446 aSig |= 0x00800000;
3447 zSign = aExp < 0;
3448 zSig = aExp << 23;
3449
3450 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 3451 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
3452 if ( aSig & 0x01000000 ) {
3453 aSig >>= 1;
3454 zSig |= i;
3455 }
3456 }
3457
3458 if ( zSign )
3459 zSig = -zSig;
3460
ff32e16e 3461 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
374dfc33
AJ
3462}
3463
158142c2
FB
3464/*----------------------------------------------------------------------------
3465| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
3466| the corresponding value `b', and 0 otherwise. The invalid exception is
3467| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
3468| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3469*----------------------------------------------------------------------------*/
3470
e5a41ffa 3471int float32_eq(float32 a, float32 b, float_status *status)
158142c2 3472{
b689362d 3473 uint32_t av, bv;
ff32e16e
PM
3474 a = float32_squash_input_denormal(a, status);
3475 b = float32_squash_input_denormal(b, status);
158142c2
FB
3476
3477 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3478 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3479 ) {
ff32e16e 3480 float_raise(float_flag_invalid, status);
158142c2
FB
3481 return 0;
3482 }
b689362d
AJ
3483 av = float32_val(a);
3484 bv = float32_val(b);
3485 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
3486}
3487
3488/*----------------------------------------------------------------------------
3489| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3490| or equal to the corresponding value `b', and 0 otherwise. The invalid
3491| exception is raised if either operand is a NaN. The comparison is performed
3492| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3493*----------------------------------------------------------------------------*/
3494
e5a41ffa 3495int float32_le(float32 a, float32 b, float_status *status)
158142c2
FB
3496{
3497 flag aSign, bSign;
bb98fe42 3498 uint32_t av, bv;
ff32e16e
PM
3499 a = float32_squash_input_denormal(a, status);
3500 b = float32_squash_input_denormal(b, status);
158142c2
FB
3501
3502 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3503 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3504 ) {
ff32e16e 3505 float_raise(float_flag_invalid, status);
158142c2
FB
3506 return 0;
3507 }
3508 aSign = extractFloat32Sign( a );
3509 bSign = extractFloat32Sign( b );
f090c9d4
PB
3510 av = float32_val(a);
3511 bv = float32_val(b);
bb98fe42 3512 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3513 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3514
3515}
3516
3517/*----------------------------------------------------------------------------
3518| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
3519| the corresponding value `b', and 0 otherwise. The invalid exception is
3520| raised if either operand is a NaN. The comparison is performed according
3521| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3522*----------------------------------------------------------------------------*/
3523
e5a41ffa 3524int float32_lt(float32 a, float32 b, float_status *status)
158142c2
FB
3525{
3526 flag aSign, bSign;
bb98fe42 3527 uint32_t av, bv;
ff32e16e
PM
3528 a = float32_squash_input_denormal(a, status);
3529 b = float32_squash_input_denormal(b, status);
158142c2
FB
3530
3531 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3532 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3533 ) {
ff32e16e 3534 float_raise(float_flag_invalid, status);
158142c2
FB
3535 return 0;
3536 }
3537 aSign = extractFloat32Sign( a );
3538 bSign = extractFloat32Sign( b );
f090c9d4
PB
3539 av = float32_val(a);
3540 bv = float32_val(b);
bb98fe42 3541 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 3542 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3543
3544}
3545
67b7861d
AJ
3546/*----------------------------------------------------------------------------
3547| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
3548| be compared, and 0 otherwise. The invalid exception is raised if either
3549| operand is a NaN. The comparison is performed according to the IEC/IEEE
3550| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
3551*----------------------------------------------------------------------------*/
3552
e5a41ffa 3553int float32_unordered(float32 a, float32 b, float_status *status)
67b7861d 3554{
ff32e16e
PM
3555 a = float32_squash_input_denormal(a, status);
3556 b = float32_squash_input_denormal(b, status);
67b7861d
AJ
3557
3558 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3559 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3560 ) {
ff32e16e 3561 float_raise(float_flag_invalid, status);
67b7861d
AJ
3562 return 1;
3563 }
3564 return 0;
3565}
b689362d 3566
158142c2
FB
3567/*----------------------------------------------------------------------------
3568| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
3569| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3570| exception. The comparison is performed according to the IEC/IEEE Standard
3571| for Binary Floating-Point Arithmetic.
158142c2
FB
3572*----------------------------------------------------------------------------*/
3573
e5a41ffa 3574int float32_eq_quiet(float32 a, float32 b, float_status *status)
158142c2 3575{
ff32e16e
PM
3576 a = float32_squash_input_denormal(a, status);
3577 b = float32_squash_input_denormal(b, status);
158142c2
FB
3578
3579 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3580 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3581 ) {
af39bc8c
AM
3582 if (float32_is_signaling_nan(a, status)
3583 || float32_is_signaling_nan(b, status)) {
ff32e16e 3584 float_raise(float_flag_invalid, status);
b689362d 3585 }
158142c2
FB
3586 return 0;
3587 }
b689362d
AJ
3588 return ( float32_val(a) == float32_val(b) ) ||
3589 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
3590}
3591
3592/*----------------------------------------------------------------------------
3593| Returns 1 if the single-precision floating-point value `a' is less than or
3594| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3595| cause an exception. Otherwise, the comparison is performed according to the
3596| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3597*----------------------------------------------------------------------------*/
3598
e5a41ffa 3599int float32_le_quiet(float32 a, float32 b, float_status *status)
158142c2
FB
3600{
3601 flag aSign, bSign;
bb98fe42 3602 uint32_t av, bv;
ff32e16e
PM
3603 a = float32_squash_input_denormal(a, status);
3604 b = float32_squash_input_denormal(b, status);
158142c2
FB
3605
3606 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3607 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3608 ) {
af39bc8c
AM
3609 if (float32_is_signaling_nan(a, status)
3610 || float32_is_signaling_nan(b, status)) {
ff32e16e 3611 float_raise(float_flag_invalid, status);
158142c2
FB
3612 }
3613 return 0;
3614 }
3615 aSign = extractFloat32Sign( a );
3616 bSign = extractFloat32Sign( b );
f090c9d4
PB
3617 av = float32_val(a);
3618 bv = float32_val(b);
bb98fe42 3619 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 3620 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
3621
3622}
3623
3624/*----------------------------------------------------------------------------
3625| Returns 1 if the single-precision floating-point value `a' is less than
3626| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3627| exception. Otherwise, the comparison is performed according to the IEC/IEEE
ab52f973 3628| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
3629*----------------------------------------------------------------------------*/
3630
ab52f973 3631int float32_lt_quiet(float32 a, float32 b, float_status *status)
158142c2 3632{
ab52f973
AB
3633 flag aSign, bSign;
3634 uint32_t av, bv;
3635 a = float32_squash_input_denormal(a, status);
3636 b = float32_squash_input_denormal(b, status);
158142c2 3637
ab52f973
AB
3638 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3639 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3640 ) {
3641 if (float32_is_signaling_nan(a, status)
3642 || float32_is_signaling_nan(b, status)) {
ff32e16e 3643 float_raise(float_flag_invalid, status);
158142c2 3644 }
ab52f973 3645 return 0;
158142c2 3646 }
ab52f973
AB
3647 aSign = extractFloat32Sign( a );
3648 bSign = extractFloat32Sign( b );
3649 av = float32_val(a);
3650 bv = float32_val(b);
3651 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3652 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
3653
3654}
3655
3656/*----------------------------------------------------------------------------
ab52f973
AB
3657| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3658| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3659| comparison is performed according to the IEC/IEEE Standard for Binary
3660| Floating-Point Arithmetic.
158142c2
FB
3661*----------------------------------------------------------------------------*/
3662
ab52f973 3663int float32_unordered_quiet(float32 a, float32 b, float_status *status)
158142c2 3664{
ab52f973
AB
3665 a = float32_squash_input_denormal(a, status);
3666 b = float32_squash_input_denormal(b, status);
158142c2 3667
ab52f973
AB
3668 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3669 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3670 ) {
3671 if (float32_is_signaling_nan(a, status)
3672 || float32_is_signaling_nan(b, status)) {
3673 float_raise(float_flag_invalid, status);
158142c2 3674 }
ab52f973 3675 return 1;
158142c2 3676 }
ab52f973 3677 return 0;
158142c2
FB
3678}
3679
ab52f973 3680
158142c2
FB
3681/*----------------------------------------------------------------------------
3682| Returns the result of converting the double-precision floating-point value
3683| `a' to the single-precision floating-point format. The conversion is
3684| performed according to the IEC/IEEE Standard for Binary Floating-Point
3685| Arithmetic.
3686*----------------------------------------------------------------------------*/
3687
e5a41ffa 3688float32 float64_to_float32(float64 a, float_status *status)
158142c2
FB
3689{
3690 flag aSign;
0c48262d 3691 int aExp;
bb98fe42
AF
3692 uint64_t aSig;
3693 uint32_t zSig;
ff32e16e 3694 a = float64_squash_input_denormal(a, status);
158142c2
FB
3695
3696 aSig = extractFloat64Frac( a );
3697 aExp = extractFloat64Exp( a );
3698 aSign = extractFloat64Sign( a );
3699 if ( aExp == 0x7FF ) {
ff32e16e
PM
3700 if (aSig) {
3701 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3702 }
158142c2
FB
3703 return packFloat32( aSign, 0xFF, 0 );
3704 }
3705 shift64RightJamming( aSig, 22, &aSig );
3706 zSig = aSig;
3707 if ( aExp || zSig ) {
3708 zSig |= 0x40000000;
3709 aExp -= 0x381;
3710 }
ff32e16e 3711 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
3712
3713}
3714
60011498
PB
3715
3716/*----------------------------------------------------------------------------
3717| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3718| half-precision floating-point value, returning the result. After being
3719| shifted into the proper positions, the three fields are simply added
3720| together to form the result. This means that any integer portion of `zSig'
3721| will be added into the exponent. Since a properly normalized significand
3722| will have an integer portion equal to 1, the `zExp' input should be 1 less
3723| than the desired result exponent whenever `zSig' is a complete, normalized
3724| significand.
3725*----------------------------------------------------------------------------*/
0c48262d 3726static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
60011498 3727{
bb4d4bb3 3728 return make_float16(
bb98fe42 3729 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3730}
3731
c4a1c5e7
PM
3732/*----------------------------------------------------------------------------
3733| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3734| and significand `zSig', and returns the proper half-precision floating-
3735| point value corresponding to the abstract input. Ordinarily, the abstract
3736| value is simply rounded and packed into the half-precision format, with
3737| the inexact exception raised if the abstract input cannot be represented
3738| exactly. However, if the abstract value is too large, the overflow and
3739| inexact exceptions are raised and an infinity or maximal finite value is
3740| returned. If the abstract value is too small, the input value is rounded to
3741| a subnormal number, and the underflow and inexact exceptions are raised if
3742| the abstract input cannot be represented exactly as a subnormal half-
3743| precision floating-point number.
3744| The `ieee' flag indicates whether to use IEEE standard half precision, or
3745| ARM-style "alternative representation", which omits the NaN and Inf
3746| encodings in order to raise the maximum representable exponent by one.
3747| The input significand `zSig' has its binary point between bits 22
3748| and 23, which is 13 bits to the left of the usual location. This shifted
3749| significand must be normalized or smaller. If `zSig' is not normalized,
3750| `zExp' must be 0; in that case, the result returned is a subnormal number,
3751| and it must not require rounding. In the usual case that `zSig' is
3752| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3753| Note the slightly odd position of the binary point in zSig compared with the
3754| other roundAndPackFloat functions. This should probably be fixed if we
3755| need to implement more float16 routines than just conversion.
3756| The handling of underflow and overflow follows the IEC/IEEE Standard for
3757| Binary Floating-Point Arithmetic.
3758*----------------------------------------------------------------------------*/
3759
0c48262d 3760static float16 roundAndPackFloat16(flag zSign, int zExp,
e5a41ffa
PM
3761 uint32_t zSig, flag ieee,
3762 float_status *status)
c4a1c5e7
PM
3763{
3764 int maxexp = ieee ? 29 : 30;
3765 uint32_t mask;
3766 uint32_t increment;
c4a1c5e7
PM
3767 bool rounding_bumps_exp;
3768 bool is_tiny = false;
3769
3770 /* Calculate the mask of bits of the mantissa which are not
3771 * representable in half-precision and will be lost.
3772 */
3773 if (zExp < 1) {
3774 /* Will be denormal in halfprec */
3775 mask = 0x00ffffff;
3776 if (zExp >= -11) {
3777 mask >>= 11 + zExp;
3778 }
3779 } else {
3780 /* Normal number in halfprec */
3781 mask = 0x00001fff;
3782 }
3783
a2f2d288 3784 switch (status->float_rounding_mode) {
c4a1c5e7
PM
3785 case float_round_nearest_even:
3786 increment = (mask + 1) >> 1;
3787 if ((zSig & mask) == increment) {
3788 increment = zSig & (increment << 1);
3789 }
3790 break;
f9288a76
PM
3791 case float_round_ties_away:
3792 increment = (mask + 1) >> 1;
3793 break;
c4a1c5e7
PM
3794 case float_round_up:
3795 increment = zSign ? 0 : mask;
3796 break;
3797 case float_round_down:
3798 increment = zSign ? mask : 0;
3799 break;
3800 default: /* round_to_zero */
3801 increment = 0;
3802 break;
3803 }
3804
3805 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3806
3807 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3808 if (ieee) {
ff32e16e 3809 float_raise(float_flag_overflow | float_flag_inexact, status);
c4a1c5e7
PM
3810 return packFloat16(zSign, 0x1f, 0);
3811 } else {
ff32e16e 3812 float_raise(float_flag_invalid, status);
c4a1c5e7
PM
3813 return packFloat16(zSign, 0x1f, 0x3ff);
3814 }
3815 }
3816
3817 if (zExp < 0) {
3818 /* Note that flush-to-zero does not affect half-precision results */
3819 is_tiny =
a2f2d288 3820 (status->float_detect_tininess == float_tininess_before_rounding)
c4a1c5e7
PM
3821 || (zExp < -1)
3822 || (!rounding_bumps_exp);
3823 }
3824 if (zSig & mask) {
ff32e16e 3825 float_raise(float_flag_inexact, status);
c4a1c5e7 3826 if (is_tiny) {
ff32e16e 3827 float_raise(float_flag_underflow, status);
c4a1c5e7
PM
3828 }
3829 }
3830
3831 zSig += increment;
3832 if (rounding_bumps_exp) {
3833 zSig >>= 1;
3834 zExp++;
3835 }
3836
3837 if (zExp < -10) {
3838 return packFloat16(zSign, 0, 0);
3839 }
3840 if (zExp < 0) {
3841 zSig >>= -zExp;
3842 zExp = 0;
3843 }
3844 return packFloat16(zSign, zExp, zSig >> 13);
3845}
3846
210cbd49
AB
3847/*----------------------------------------------------------------------------
3848| If `a' is denormal and we are in flush-to-zero mode then set the
3849| input-denormal exception and return zero. Otherwise just return the value.
3850*----------------------------------------------------------------------------*/
3851float16 float16_squash_input_denormal(float16 a, float_status *status)
3852{
3853 if (status->flush_inputs_to_zero) {
3854 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3855 float_raise(float_flag_input_denormal, status);
3856 return make_float16(float16_val(a) & 0x8000);
3857 }
3858 }
3859 return a;
3860}
3861
/*----------------------------------------------------------------------------
| Normalizes the half-precision subnormal significand `aSig'.  The
| significand is shifted left by (number of leading zeros - 21), which
| brings its most significant set bit to bit 10; the shifted value is stored
| through `zSigPtr' and the matching exponent (1 - shift count) through
| `zExpPtr'.
*----------------------------------------------------------------------------*/
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t shift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3869
60011498
PB
3870/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3871 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3 3872
e5a41ffa 3873float32 float16_to_float32(float16 a, flag ieee, float_status *status)
60011498
PB
3874{
3875 flag aSign;
0c48262d 3876 int aExp;
bb98fe42 3877 uint32_t aSig;
60011498 3878
bb4d4bb3
PM
3879 aSign = extractFloat16Sign(a);
3880 aExp = extractFloat16Exp(a);
3881 aSig = extractFloat16Frac(a);
60011498
PB
3882
3883 if (aExp == 0x1f && ieee) {
3884 if (aSig) {
ff32e16e 3885 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
60011498 3886 }
4be8eeac 3887 return packFloat32(aSign, 0xff, 0);
60011498
PB
3888 }
3889 if (aExp == 0) {
60011498
PB
3890 if (aSig == 0) {
3891 return packFloat32(aSign, 0, 0);
3892 }
3893
c4a1c5e7
PM
3894 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3895 aExp--;
60011498
PB
3896 }
3897 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3898}
3899
e5a41ffa 3900float16 float32_to_float16(float32 a, flag ieee, float_status *status)
60011498
PB
3901{
3902 flag aSign;
0c48262d 3903 int aExp;
bb98fe42 3904 uint32_t aSig;
38970efa 3905
ff32e16e 3906 a = float32_squash_input_denormal(a, status);
60011498
PB
3907
3908 aSig = extractFloat32Frac( a );
3909 aExp = extractFloat32Exp( a );
3910 aSign = extractFloat32Sign( a );
3911 if ( aExp == 0xFF ) {
3912 if (aSig) {
600e30d2 3913 /* Input is a NaN */
600e30d2 3914 if (!ieee) {
ff32e16e 3915 float_raise(float_flag_invalid, status);
600e30d2
PM
3916 return packFloat16(aSign, 0, 0);
3917 }
38970efa 3918 return commonNaNToFloat16(
ff32e16e 3919 float32ToCommonNaN(a, status), status);
60011498 3920 }
600e30d2
PM
3921 /* Infinity */
3922 if (!ieee) {
ff32e16e 3923 float_raise(float_flag_invalid, status);
600e30d2
PM
3924 return packFloat16(aSign, 0x1f, 0x3ff);
3925 }
3926 return packFloat16(aSign, 0x1f, 0);
60011498 3927 }
600e30d2 3928 if (aExp == 0 && aSig == 0) {
60011498
PB
3929 return packFloat16(aSign, 0, 0);
3930 }
38970efa
PM
3931 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3932 * even if the input is denormal; however this is harmless because
3933 * the largest possible single-precision denormal is still smaller
3934 * than the smallest representable half-precision denormal, and so we
3935 * will end up ignoring aSig and returning via the "always return zero"
3936 * codepath.
3937 */
60011498 3938 aSig |= 0x00800000;
c4a1c5e7 3939 aExp -= 0x71;
60011498 3940
ff32e16e 3941 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
60011498
PB
3942}
3943
e5a41ffa 3944float64 float16_to_float64(float16 a, flag ieee, float_status *status)
14c9a07e
PM
3945{
3946 flag aSign;
0c48262d 3947 int aExp;
14c9a07e
PM
3948 uint32_t aSig;
3949
3950 aSign = extractFloat16Sign(a);
3951 aExp = extractFloat16Exp(a);
3952 aSig = extractFloat16Frac(a);
3953
3954 if (aExp == 0x1f && ieee) {
3955 if (aSig) {
3956 return commonNaNToFloat64(
ff32e16e 3957 float16ToCommonNaN(a, status), status);
14c9a07e
PM
3958 }
3959 return packFloat64(aSign, 0x7ff, 0);
3960 }
3961 if (aExp == 0) {
3962 if (aSig == 0) {
3963 return packFloat64(aSign, 0, 0);
3964 }
3965
3966 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3967 aExp--;
3968 }
3969 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3970}
3971
e5a41ffa 3972float16 float64_to_float16(float64 a, flag ieee, float_status *status)
14c9a07e
PM
3973{
3974 flag aSign;
0c48262d 3975 int aExp;
14c9a07e
PM
3976 uint64_t aSig;
3977 uint32_t zSig;
3978
ff32e16e 3979 a = float64_squash_input_denormal(a, status);
14c9a07e
PM
3980
3981 aSig = extractFloat64Frac(a);
3982 aExp = extractFloat64Exp(a);
3983 aSign = extractFloat64Sign(a);
3984 if (aExp == 0x7FF) {
3985 if (aSig) {
3986 /* Input is a NaN */
3987 if (!ieee) {
ff32e16e 3988 float_raise(float_flag_invalid, status);
14c9a07e
PM
3989 return packFloat16(aSign, 0, 0);
3990 }
3991 return commonNaNToFloat16(
ff32e16e 3992 float64ToCommonNaN(a, status), status);
14c9a07e
PM
3993 }
3994 /* Infinity */
3995 if (!ieee) {
ff32e16e 3996 float_raise(float_flag_invalid, status);
14c9a07e
PM
3997 return packFloat16(aSign, 0x1f, 0x3ff);
3998 }
3999 return packFloat16(aSign, 0x1f, 0);
4000 }
4001 shift64RightJamming(aSig, 29, &aSig);
4002 zSig = aSig;
4003 if (aExp == 0 && zSig == 0) {
4004 return packFloat16(aSign, 0, 0);
4005 }
4006 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4007 * even if the input is denormal; however this is harmless because
4008 * the largest possible single-precision denormal is still smaller
4009 * than the smallest representable half-precision denormal, and so we
4010 * will end up ignoring aSig and returning via the "always return zero"
4011 * codepath.
4012 */
4013 zSig |= 0x00800000;
4014 aExp -= 0x3F1;
4015
ff32e16e 4016 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
14c9a07e
PM
4017}
4018
158142c2
FB
4019/*----------------------------------------------------------------------------
4020| Returns the result of converting the double-precision floating-point value
4021| `a' to the extended double-precision floating-point format. The conversion
4022| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4023| Arithmetic.
4024*----------------------------------------------------------------------------*/
4025
e5a41ffa 4026floatx80 float64_to_floatx80(float64 a, float_status *status)
158142c2
FB
4027{
4028 flag aSign;
0c48262d 4029 int aExp;
bb98fe42 4030 uint64_t aSig;
158142c2 4031
ff32e16e 4032 a = float64_squash_input_denormal(a, status);
158142c2
FB
4033 aSig = extractFloat64Frac( a );
4034 aExp = extractFloat64Exp( a );
4035 aSign = extractFloat64Sign( a );
4036 if ( aExp == 0x7FF ) {
ff32e16e
PM
4037 if (aSig) {
4038 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4039 }
0f605c88
LV
4040 return packFloatx80(aSign,
4041 floatx80_infinity_high,
4042 floatx80_infinity_low);
158142c2
FB
4043 }
4044 if ( aExp == 0 ) {
4045 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4046 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4047 }
4048 return
4049 packFloatx80(
4050 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4051
4052}
4053
158142c2
FB
4054/*----------------------------------------------------------------------------
4055| Returns the result of converting the double-precision floating-point value
4056| `a' to the quadruple-precision floating-point format. The conversion is
4057| performed according to the IEC/IEEE Standard for Binary Floating-Point
4058| Arithmetic.
4059*----------------------------------------------------------------------------*/
4060
e5a41ffa 4061float128 float64_to_float128(float64 a, float_status *status)
158142c2
FB
4062{
4063 flag aSign;
0c48262d 4064 int aExp;
bb98fe42 4065 uint64_t aSig, zSig0, zSig1;
158142c2 4066
ff32e16e 4067 a = float64_squash_input_denormal(a, status);
158142c2
FB
4068 aSig = extractFloat64Frac( a );
4069 aExp = extractFloat64Exp( a );
4070 aSign = extractFloat64Sign( a );
4071 if ( aExp == 0x7FF ) {
ff32e16e
PM
4072 if (aSig) {
4073 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4074 }
158142c2
FB
4075 return packFloat128( aSign, 0x7FFF, 0, 0 );
4076 }
4077 if ( aExp == 0 ) {
4078 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4079 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4080 --aExp;
4081 }
4082 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4083 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4084
4085}
4086
158142c2
FB
4087
4088/*----------------------------------------------------------------------------
4089| Returns the remainder of the double-precision floating-point value `a'
4090| with respect to the corresponding value `b'. The operation is performed
4091| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4092*----------------------------------------------------------------------------*/
4093
e5a41ffa 4094float64 float64_rem(float64 a, float64 b, float_status *status)
158142c2 4095{
ed086f3d 4096 flag aSign, zSign;
0c48262d 4097 int aExp, bExp, expDiff;
bb98fe42
AF
4098 uint64_t aSig, bSig;
4099 uint64_t q, alternateASig;
4100 int64_t sigMean;
158142c2 4101
ff32e16e
PM
4102 a = float64_squash_input_denormal(a, status);
4103 b = float64_squash_input_denormal(b, status);
158142c2
FB
4104 aSig = extractFloat64Frac( a );
4105 aExp = extractFloat64Exp( a );
4106 aSign = extractFloat64Sign( a );
4107 bSig = extractFloat64Frac( b );
4108 bExp = extractFloat64Exp( b );
158142c2
FB
4109 if ( aExp == 0x7FF ) {
4110 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
ff32e16e 4111 return propagateFloat64NaN(a, b, status);
158142c2 4112 }
ff32e16e 4113 float_raise(float_flag_invalid, status);
af39bc8c 4114 return float64_default_nan(status);
158142c2
FB
4115 }
4116 if ( bExp == 0x7FF ) {
ff32e16e
PM
4117 if (bSig) {
4118 return propagateFloat64NaN(a, b, status);
4119 }
158142c2
FB
4120 return a;
4121 }
4122 if ( bExp == 0 ) {
4123 if ( bSig == 0 ) {
ff32e16e 4124 float_raise(float_flag_invalid, status);
af39bc8c 4125 return float64_default_nan(status);
158142c2
FB
4126 }
4127 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4128 }
4129 if ( aExp == 0 ) {
4130 if ( aSig == 0 ) return a;
4131 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4132 }
4133 expDiff = aExp - bExp;
4134 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4135 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4136 if ( expDiff < 0 ) {
4137 if ( expDiff < -1 ) return a;
4138 aSig >>= 1;
4139 }
4140 q = ( bSig <= aSig );
4141 if ( q ) aSig -= bSig;
4142 expDiff -= 64;
4143 while ( 0 < expDiff ) {
4144 q = estimateDiv128To64( aSig, 0, bSig );
4145 q = ( 2 < q ) ? q - 2 : 0;
4146 aSig = - ( ( bSig>>2 ) * q );
4147 expDiff -= 62;
4148 }
4149 expDiff += 64;
4150 if ( 0 < expDiff ) {
4151 q = estimateDiv128To64( aSig, 0, bSig );
4152 q = ( 2 < q ) ? q - 2 : 0;
4153 q >>= 64 - expDiff;
4154 bSig >>= 2;
4155 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4156 }
4157 else {
4158 aSig >>= 2;
4159 bSig >>= 2;
4160 }
4161 do {
4162 alternateASig = aSig;
4163 ++q;
4164 aSig -= bSig;
bb98fe42 4165 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4166 sigMean = aSig + alternateASig;
4167 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4168 aSig = alternateASig;
4169 }
bb98fe42 4170 zSign = ( (int64_t) aSig < 0 );
158142c2 4171 if ( zSign ) aSig = - aSig;
ff32e16e 4172 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
158142c2
FB
4173
4174}
4175
374dfc33
AJ
4176/*----------------------------------------------------------------------------
4177| Returns the binary log of the double-precision floating-point value `a'.
4178| The operation is performed according to the IEC/IEEE Standard for Binary
4179| Floating-Point Arithmetic.
4180*----------------------------------------------------------------------------*/
e5a41ffa 4181float64 float64_log2(float64 a, float_status *status)
374dfc33
AJ
4182{
4183 flag aSign, zSign;
0c48262d 4184 int aExp;
bb98fe42 4185 uint64_t aSig, aSig0, aSig1, zSig, i;
ff32e16e 4186 a = float64_squash_input_denormal(a, status);
374dfc33
AJ
4187
4188 aSig = extractFloat64Frac( a );
4189 aExp = extractFloat64Exp( a );
4190 aSign = extractFloat64Sign( a );
4191
4192 if ( aExp == 0 ) {
4193 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4194 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4195 }
4196 if ( aSign ) {
ff32e16e 4197 float_raise(float_flag_invalid, status);
af39bc8c 4198 return float64_default_nan(status);
374dfc33
AJ
4199 }
4200 if ( aExp == 0x7FF ) {
ff32e16e
PM
4201 if (aSig) {
4202 return propagateFloat64NaN(a, float64_zero, status);
4203 }
374dfc33
AJ
4204 return a;
4205 }
4206
4207 aExp -= 0x3FF;
4208 aSig |= LIT64( 0x0010000000000000 );
4209 zSign = aExp < 0;
bb98fe42 4210 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4211 for (i = 1LL << 51; i > 0; i >>= 1) {
4212 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4213 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4214 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4215 aSig >>= 1;
4216 zSig |= i;
4217 }
4218 }
4219
4220 if ( zSign )
4221 zSig = -zSig;
ff32e16e 4222 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
374dfc33
AJ
4223}
4224
158142c2
FB
4225/*----------------------------------------------------------------------------
4226| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4227| corresponding value `b', and 0 otherwise. The invalid exception is raised
4228| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4229| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4230*----------------------------------------------------------------------------*/
4231
e5a41ffa 4232int float64_eq(float64 a, float64 b, float_status *status)
158142c2 4233{
bb98fe42 4234 uint64_t av, bv;
ff32e16e
PM
4235 a = float64_squash_input_denormal(a, status);
4236 b = float64_squash_input_denormal(b, status);
158142c2
FB
4237
4238 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4239 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4240 ) {
ff32e16e 4241 float_raise(float_flag_invalid, status);
158142c2
FB
4242 return 0;
4243 }
f090c9d4 4244 av = float64_val(a);
a1b91bb4 4245 bv = float64_val(b);
bb98fe42 4246 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4247
4248}
4249
4250/*----------------------------------------------------------------------------
4251| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4252| equal to the corresponding value `b', and 0 otherwise. The invalid
4253| exception is raised if either operand is a NaN. The comparison is performed
4254| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4255*----------------------------------------------------------------------------*/
4256
e5a41ffa 4257int float64_le(float64 a, float64 b, float_status *status)
158142c2
FB
4258{
4259 flag aSign, bSign;
bb98fe42 4260 uint64_t av, bv;
ff32e16e
PM
4261 a = float64_squash_input_denormal(a, status);
4262 b = float64_squash_input_denormal(b, status);
158142c2
FB
4263
4264 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4265 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4266 ) {
ff32e16e 4267 float_raise(float_flag_invalid, status);
158142c2
FB
4268 return 0;
4269 }
4270 aSign = extractFloat64Sign( a );
4271 bSign = extractFloat64Sign( b );
f090c9d4 4272 av = float64_val(a);
a1b91bb4 4273 bv = float64_val(b);
bb98fe42 4274 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4275 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4276
4277}
4278
4279/*----------------------------------------------------------------------------
4280| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4281| the corresponding value `b', and 0 otherwise. The invalid exception is
4282| raised if either operand is a NaN. The comparison is performed according
4283| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4284*----------------------------------------------------------------------------*/
4285
e5a41ffa 4286int float64_lt(float64 a, float64 b, float_status *status)
158142c2
FB
4287{
4288 flag aSign, bSign;
bb98fe42 4289 uint64_t av, bv;
158142c2 4290
ff32e16e
PM
4291 a = float64_squash_input_denormal(a, status);
4292 b = float64_squash_input_denormal(b, status);
158142c2
FB
4293 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4294 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4295 ) {
ff32e16e 4296 float_raise(float_flag_invalid, status);
158142c2
FB
4297 return 0;
4298 }
4299 aSign = extractFloat64Sign( a );
4300 bSign = extractFloat64Sign( b );
f090c9d4 4301 av = float64_val(a);
a1b91bb4 4302 bv = float64_val(b);
bb98fe42 4303 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4304 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4305
4306}
4307
67b7861d
AJ
4308/*----------------------------------------------------------------------------
4309| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4310| be compared, and 0 otherwise. The invalid exception is raised if either
4311| operand is a NaN. The comparison is performed according to the IEC/IEEE
4312| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4313*----------------------------------------------------------------------------*/
4314
e5a41ffa 4315int float64_unordered(float64 a, float64 b, float_status *status)
67b7861d 4316{
ff32e16e
PM
4317 a = float64_squash_input_denormal(a, status);
4318 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4319
4320 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4321 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4322 ) {
ff32e16e 4323 float_raise(float_flag_invalid, status);
67b7861d
AJ
4324 return 1;
4325 }
4326 return 0;
4327}
4328
158142c2
FB
4329/*----------------------------------------------------------------------------
4330| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4331| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4332| exception.The comparison is performed according to the IEC/IEEE Standard
4333| for Binary Floating-Point Arithmetic.
158142c2
FB
4334*----------------------------------------------------------------------------*/
4335
e5a41ffa 4336int float64_eq_quiet(float64 a, float64 b, float_status *status)
158142c2 4337{
bb98fe42 4338 uint64_t av, bv;
ff32e16e
PM
4339 a = float64_squash_input_denormal(a, status);
4340 b = float64_squash_input_denormal(b, status);
158142c2
FB
4341
4342 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4343 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4344 ) {
af39bc8c
AM
4345 if (float64_is_signaling_nan(a, status)
4346 || float64_is_signaling_nan(b, status)) {
ff32e16e 4347 float_raise(float_flag_invalid, status);
b689362d 4348 }
158142c2
FB
4349 return 0;
4350 }
f090c9d4 4351 av = float64_val(a);
a1b91bb4 4352 bv = float64_val(b);
bb98fe42 4353 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4354
4355}
4356
4357/*----------------------------------------------------------------------------
4358| Returns 1 if the double-precision floating-point value `a' is less than or
4359| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4360| cause an exception. Otherwise, the comparison is performed according to the
4361| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4362*----------------------------------------------------------------------------*/
4363
e5a41ffa 4364int float64_le_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4365{
4366 flag aSign, bSign;
bb98fe42 4367 uint64_t av, bv;
ff32e16e
PM
4368 a = float64_squash_input_denormal(a, status);
4369 b = float64_squash_input_denormal(b, status);
158142c2
FB
4370
4371 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4372 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4373 ) {
af39bc8c
AM
4374 if (float64_is_signaling_nan(a, status)
4375 || float64_is_signaling_nan(b, status)) {
ff32e16e 4376 float_raise(float_flag_invalid, status);
158142c2
FB
4377 }
4378 return 0;
4379 }
4380 aSign = extractFloat64Sign( a );
4381 bSign = extractFloat64Sign( b );
f090c9d4 4382 av = float64_val(a);
a1b91bb4 4383 bv = float64_val(b);
bb98fe42 4384 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4385 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4386
4387}
4388
4389/*----------------------------------------------------------------------------
4390| Returns 1 if the double-precision floating-point value `a' is less than
4391| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4392| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4393| Standard for Binary Floating-Point Arithmetic.
4394*----------------------------------------------------------------------------*/
4395
e5a41ffa 4396int float64_lt_quiet(float64 a, float64 b, float_status *status)
158142c2
FB
4397{
4398 flag aSign, bSign;
bb98fe42 4399 uint64_t av, bv;
ff32e16e
PM
4400 a = float64_squash_input_denormal(a, status);
4401 b = float64_squash_input_denormal(b, status);
158142c2
FB
4402
4403 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4404 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4405 ) {
af39bc8c
AM
4406 if (float64_is_signaling_nan(a, status)
4407 || float64_is_signaling_nan(b, status)) {
ff32e16e 4408 float_raise(float_flag_invalid, status);
158142c2
FB
4409 }
4410 return 0;
4411 }
4412 aSign = extractFloat64Sign( a );
4413 bSign = extractFloat64Sign( b );
f090c9d4 4414 av = float64_val(a);
a1b91bb4 4415 bv = float64_val(b);
bb98fe42 4416 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4417 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4418
4419}
4420
67b7861d
AJ
4421/*----------------------------------------------------------------------------
4422| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4423| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4424| comparison is performed according to the IEC/IEEE Standard for Binary
4425| Floating-Point Arithmetic.
4426*----------------------------------------------------------------------------*/
4427
e5a41ffa 4428int float64_unordered_quiet(float64 a, float64 b, float_status *status)
67b7861d 4429{
ff32e16e
PM
4430 a = float64_squash_input_denormal(a, status);
4431 b = float64_squash_input_denormal(b, status);
67b7861d
AJ
4432
4433 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4434 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4435 ) {
af39bc8c
AM
4436 if (float64_is_signaling_nan(a, status)
4437 || float64_is_signaling_nan(b, status)) {
ff32e16e 4438 float_raise(float_flag_invalid, status);
67b7861d
AJ
4439 }
4440 return 1;
4441 }
4442 return 0;
4443}
4444
158142c2
FB
4445/*----------------------------------------------------------------------------
4446| Returns the result of converting the extended double-precision floating-
4447| point value `a' to the 32-bit two's complement integer format. The
4448| conversion is performed according to the IEC/IEEE Standard for Binary
4449| Floating-Point Arithmetic---which means in particular that the conversion
4450| is rounded according to the current rounding mode. If `a' is a NaN, the
4451| largest positive integer is returned. Otherwise, if the conversion
4452| overflows, the largest integer with the same sign as `a' is returned.
4453*----------------------------------------------------------------------------*/
4454
f4014512 4455int32_t floatx80_to_int32(floatx80 a, float_status *status)
158142c2
FB
4456{
4457 flag aSign;
f4014512 4458 int32_t aExp, shiftCount;
bb98fe42 4459 uint64_t aSig;
158142c2 4460
d1eb8f2a
AD
4461 if (floatx80_invalid_encoding(a)) {
4462 float_raise(float_flag_invalid, status);
4463 return 1 << 31;
4464 }
158142c2
FB
4465 aSig = extractFloatx80Frac( a );
4466 aExp = extractFloatx80Exp( a );
4467 aSign = extractFloatx80Sign( a );
bb98fe42 4468 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4469 shiftCount = 0x4037 - aExp;
4470 if ( shiftCount <= 0 ) shiftCount = 1;
4471 shift64RightJamming( aSig, shiftCount, &aSig );
ff32e16e 4472 return roundAndPackInt32(aSign, aSig, status);
158142c2
FB
4473
4474}
4475
4476/*----------------------------------------------------------------------------
4477| Returns the result of converting the extended double-precision floating-
4478| point value `a' to the 32-bit two's complement integer format. The
4479| conversion is performed according to the IEC/IEEE Standard for Binary
4480| Floating-Point Arithmetic, except that the conversion is always rounded
4481| toward zero. If `a' is a NaN, the largest positive integer is returned.
4482| Otherwise, if the conversion overflows, the largest integer with the same
4483| sign as `a' is returned.
4484*----------------------------------------------------------------------------*/
4485
f4014512 4486int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4487{
4488 flag aSign;
f4014512 4489 int32_t aExp, shiftCount;
bb98fe42 4490 uint64_t aSig, savedASig;
b3a6a2e0 4491 int32_t z;
158142c2 4492
d1eb8f2a
AD
4493 if (floatx80_invalid_encoding(a)) {
4494 float_raise(float_flag_invalid, status);
4495 return 1 << 31;
4496 }
158142c2
FB
4497 aSig = extractFloatx80Frac( a );
4498 aExp = extractFloatx80Exp( a );
4499 aSign = extractFloatx80Sign( a );
4500 if ( 0x401E < aExp ) {
bb98fe42 4501 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4502 goto invalid;
4503 }
4504 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4505 if (aExp || aSig) {
4506 status->float_exception_flags |= float_flag_inexact;
4507 }
158142c2
FB
4508 return 0;
4509 }
4510 shiftCount = 0x403E - aExp;
4511 savedASig = aSig;
4512 aSig >>= shiftCount;
4513 z = aSig;
4514 if ( aSign ) z = - z;
4515 if ( ( z < 0 ) ^ aSign ) {
4516 invalid:
ff32e16e 4517 float_raise(float_flag_invalid, status);
bb98fe42 4518 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4519 }
4520 if ( ( aSig<<shiftCount ) != savedASig ) {
a2f2d288 4521 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4522 }
4523 return z;
4524
4525}
4526
4527/*----------------------------------------------------------------------------
4528| Returns the result of converting the extended double-precision floating-
4529| point value `a' to the 64-bit two's complement integer format. The
4530| conversion is performed according to the IEC/IEEE Standard for Binary
4531| Floating-Point Arithmetic---which means in particular that the conversion
4532| is rounded according to the current rounding mode. If `a' is a NaN,
4533| the largest positive integer is returned. Otherwise, if the conversion
4534| overflows, the largest integer with the same sign as `a' is returned.
4535*----------------------------------------------------------------------------*/
4536
f42c2224 4537int64_t floatx80_to_int64(floatx80 a, float_status *status)
158142c2
FB
4538{
4539 flag aSign;
f4014512 4540 int32_t aExp, shiftCount;
bb98fe42 4541 uint64_t aSig, aSigExtra;
158142c2 4542
d1eb8f2a
AD
4543 if (floatx80_invalid_encoding(a)) {
4544 float_raise(float_flag_invalid, status);
4545 return 1ULL << 63;
4546 }
158142c2
FB
4547 aSig = extractFloatx80Frac( a );
4548 aExp = extractFloatx80Exp( a );
4549 aSign = extractFloatx80Sign( a );
4550 shiftCount = 0x403E - aExp;
4551 if ( shiftCount <= 0 ) {
4552 if ( shiftCount ) {
ff32e16e 4553 float_raise(float_flag_invalid, status);
0f605c88 4554 if (!aSign || floatx80_is_any_nan(a)) {
158142c2
FB
4555 return LIT64( 0x7FFFFFFFFFFFFFFF );
4556 }
bb98fe42 4557 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4558 }
4559 aSigExtra = 0;
4560 }
4561 else {
4562 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4563 }
ff32e16e 4564 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
158142c2
FB
4565
4566}
4567
4568/*----------------------------------------------------------------------------
4569| Returns the result of converting the extended double-precision floating-
4570| point value `a' to the 64-bit two's complement integer format. The
4571| conversion is performed according to the IEC/IEEE Standard for Binary
4572| Floating-Point Arithmetic, except that the conversion is always rounded
4573| toward zero. If `a' is a NaN, the largest positive integer is returned.
4574| Otherwise, if the conversion overflows, the largest integer with the same
4575| sign as `a' is returned.
4576*----------------------------------------------------------------------------*/
4577
f42c2224 4578int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
158142c2
FB
4579{
4580 flag aSign;
f4014512 4581 int32_t aExp, shiftCount;
bb98fe42 4582 uint64_t aSig;
f42c2224 4583 int64_t z;
158142c2 4584
d1eb8f2a
AD
4585 if (floatx80_invalid_encoding(a)) {
4586 float_raise(float_flag_invalid, status);
4587 return 1ULL << 63;
4588 }
158142c2
FB
4589 aSig = extractFloatx80Frac( a );
4590 aExp = extractFloatx80Exp( a );
4591 aSign = extractFloatx80Sign( a );
4592 shiftCount = aExp - 0x403E;
4593 if ( 0 <= shiftCount ) {
4594 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4595 if ( ( a.high != 0xC03E ) || aSig ) {
ff32e16e 4596 float_raise(float_flag_invalid, status);
158142c2
FB
4597 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4598 return LIT64( 0x7FFFFFFFFFFFFFFF );
4599 }
4600 }
bb98fe42 4601 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4602 }
4603 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
4604 if (aExp | aSig) {
4605 status->float_exception_flags |= float_flag_inexact;
4606 }
158142c2
FB
4607 return 0;
4608 }
4609 z = aSig>>( - shiftCount );
bb98fe42 4610 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
a2f2d288 4611 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
4612 }
4613 if ( aSign ) z = - z;
4614 return z;
4615
4616}
4617
4618/*----------------------------------------------------------------------------
4619| Returns the result of converting the extended double-precision floating-
4620| point value `a' to the single-precision floating-point format. The
4621| conversion is performed according to the IEC/IEEE Standard for Binary
4622| Floating-Point Arithmetic.
4623*----------------------------------------------------------------------------*/
4624
e5a41ffa 4625float32 floatx80_to_float32(floatx80 a, float_status *status)
158142c2
FB
4626{
4627 flag aSign;
f4014512 4628 int32_t aExp;
bb98fe42 4629 uint64_t aSig;
158142c2 4630
d1eb8f2a
AD
4631 if (floatx80_invalid_encoding(a)) {
4632 float_raise(float_flag_invalid, status);
4633 return float32_default_nan(status);
4634 }
158142c2
FB
4635 aSig = extractFloatx80Frac( a );
4636 aExp = extractFloatx80Exp( a );
4637 aSign = extractFloatx80Sign( a );
4638 if ( aExp == 0x7FFF ) {
bb98fe42 4639 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4640 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4641 }
4642 return packFloat32( aSign, 0xFF, 0 );
4643 }
4644 shift64RightJamming( aSig, 33, &aSig );
4645 if ( aExp || aSig ) aExp -= 0x3F81;
ff32e16e 4646 return roundAndPackFloat32(aSign, aExp, aSig, status);
158142c2
FB
4647
4648}
4649
4650/*----------------------------------------------------------------------------
4651| Returns the result of converting the extended double-precision floating-
4652| point value `a' to the double-precision floating-point format. The
4653| conversion is performed according to the IEC/IEEE Standard for Binary
4654| Floating-Point Arithmetic.
4655*----------------------------------------------------------------------------*/
4656
e5a41ffa 4657float64 floatx80_to_float64(floatx80 a, float_status *status)
158142c2
FB
4658{
4659 flag aSign;
f4014512 4660 int32_t aExp;
bb98fe42 4661 uint64_t aSig, zSig;
158142c2 4662
d1eb8f2a
AD
4663 if (floatx80_invalid_encoding(a)) {
4664 float_raise(float_flag_invalid, status);
4665 return float64_default_nan(status);
4666 }
158142c2
FB
4667 aSig = extractFloatx80Frac( a );
4668 aExp = extractFloatx80Exp( a );
4669 aSign = extractFloatx80Sign( a );
4670 if ( aExp == 0x7FFF ) {
bb98fe42 4671 if ( (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4672 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4673 }
4674 return packFloat64( aSign, 0x7FF, 0 );
4675 }
4676 shift64RightJamming( aSig, 1, &zSig );
4677 if ( aExp || aSig ) aExp -= 0x3C01;
ff32e16e 4678 return roundAndPackFloat64(aSign, aExp, zSig, status);
158142c2
FB
4679
4680}
4681
158142c2
FB
4682/*----------------------------------------------------------------------------
4683| Returns the result of converting the extended double-precision floating-
4684| point value `a' to the quadruple-precision floating-point format. The
4685| conversion is performed according to the IEC/IEEE Standard for Binary
4686| Floating-Point Arithmetic.
4687*----------------------------------------------------------------------------*/
4688
e5a41ffa 4689float128 floatx80_to_float128(floatx80 a, float_status *status)
158142c2
FB
4690{
4691 flag aSign;
0c48262d 4692 int aExp;
bb98fe42 4693 uint64_t aSig, zSig0, zSig1;
158142c2 4694
d1eb8f2a
AD
4695 if (floatx80_invalid_encoding(a)) {
4696 float_raise(float_flag_invalid, status);
4697 return float128_default_nan(status);
4698 }
158142c2
FB
4699 aSig = extractFloatx80Frac( a );
4700 aExp = extractFloatx80Exp( a );
4701 aSign = extractFloatx80Sign( a );
bb98fe42 4702 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
ff32e16e 4703 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
158142c2
FB
4704 }
4705 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4706 return packFloat128( aSign, aExp, zSig0, zSig1 );
4707
4708}
4709
0f721292
LV
4710/*----------------------------------------------------------------------------
4711| Rounds the extended double-precision floating-point value `a'
4712| to the precision provided by floatx80_rounding_precision and returns the
4713| result as an extended double-precision floating-point value.
4714| The operation is performed according to the IEC/IEEE Standard for Binary
4715| Floating-Point Arithmetic.
4716*----------------------------------------------------------------------------*/
4717
4718floatx80 floatx80_round(floatx80 a, float_status *status)
4719{
4720 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4721 extractFloatx80Sign(a),
4722 extractFloatx80Exp(a),
4723 extractFloatx80Frac(a), 0, status);
4724}
4725
158142c2
FB
4726/*----------------------------------------------------------------------------
4727| Rounds the extended double-precision floating-point value `a' to an integer,
4728| and returns the result as an extended double-precision floating-point
4729| value. The operation is performed according to the IEC/IEEE Standard for
4730| Binary Floating-Point Arithmetic.
4731*----------------------------------------------------------------------------*/
4732
e5a41ffa 4733floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
158142c2
FB
4734{
4735 flag aSign;
f4014512 4736 int32_t aExp;
bb98fe42 4737 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4738 floatx80 z;
4739
d1eb8f2a
AD
4740 if (floatx80_invalid_encoding(a)) {
4741 float_raise(float_flag_invalid, status);
4742 return floatx80_default_nan(status);
4743 }
158142c2
FB
4744 aExp = extractFloatx80Exp( a );
4745 if ( 0x403E <= aExp ) {
bb98fe42 4746 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
ff32e16e 4747 return propagateFloatx80NaN(a, a, status);
158142c2
FB
4748 }
4749 return a;
4750 }
4751 if ( aExp < 0x3FFF ) {
4752 if ( ( aExp == 0 )
bb98fe42 4753 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4754 return a;
4755 }
a2f2d288 4756 status->float_exception_flags |= float_flag_inexact;
158142c2 4757 aSign = extractFloatx80Sign( a );
a2f2d288 4758 switch (status->float_rounding_mode) {
158142c2 4759 case float_round_nearest_even:
bb98fe42 4760 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4761 ) {
4762 return
4763 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4764 }
4765 break;
f9288a76
PM
4766 case float_round_ties_away:
4767 if (aExp == 0x3FFE) {
4768 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4769 }
4770 break;
158142c2
FB
4771 case float_round_down:
4772 return
4773 aSign ?
4774 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4775 : packFloatx80( 0, 0, 0 );
4776 case float_round_up:
4777 return
4778 aSign ? packFloatx80( 1, 0, 0 )
4779 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4780 }
4781 return packFloatx80( aSign, 0, 0 );
4782 }
4783 lastBitMask = 1;
4784 lastBitMask <<= 0x403E - aExp;
4785 roundBitsMask = lastBitMask - 1;
4786 z = a;
a2f2d288 4787 switch (status->float_rounding_mode) {
dc355b76 4788 case float_round_nearest_even:
158142c2 4789 z.low += lastBitMask>>1;
dc355b76
PM
4790 if ((z.low & roundBitsMask) == 0) {
4791 z.low &= ~lastBitMask;
4792 }
4793 break;
f9288a76
PM
4794 case float_round_ties_away:
4795 z.low += lastBitMask >> 1;
4796 break;
dc355b76
PM
4797 case float_round_to_zero:
4798 break;
4799 case float_round_up:
4800 if (!extractFloatx80Sign(z)) {
4801 z.low += roundBitsMask;
4802 }
4803 break;
4804 case float_round_down:
4805 if (extractFloatx80Sign(z)) {
158142c2
FB
4806 z.low += roundBitsMask;
4807 }
dc355b76
PM
4808 break;
4809 default:
4810 abort();
158142c2
FB
4811 }
4812 z.low &= ~ roundBitsMask;
4813 if ( z.low == 0 ) {
4814 ++z.high;
4815 z.low = LIT64( 0x8000000000000000 );
4816 }
a2f2d288
PM
4817 if (z.low != a.low) {
4818 status->float_exception_flags |= float_flag_inexact;
4819 }
158142c2
FB
4820 return z;
4821
4822}
4823
4824/*----------------------------------------------------------------------------
4825| Returns the result of adding the absolute values of the extended double-
4826| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4827| negated before being returned. `zSign' is ignored if the result is a NaN.
4828| The addition is performed according to the IEC/IEEE Standard for Binary
4829| Floating-Point Arithmetic.
4830*----------------------------------------------------------------------------*/
4831
e5a41ffa
PM
4832static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4833 float_status *status)
158142c2 4834{
f4014512 4835 int32_t aExp, bExp, zExp;
bb98fe42 4836 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 4837 int32_t expDiff;
158142c2
FB
4838
4839 aSig = extractFloatx80Frac( a );
4840 aExp = extractFloatx80Exp( a );
4841 bSig = extractFloatx80Frac( b );
4842 bExp = extractFloatx80Exp( b );
4843 expDiff = aExp - bExp;
4844 if ( 0 < expDiff ) {
4845 if ( aExp == 0x7FFF ) {
ff32e16e
PM
4846 if ((uint64_t)(aSig << 1)) {
4847 return propagateFloatx80NaN(a, b, status);
4848 }
158142c2
FB
4849 return a;
4850 }
4851 if ( bExp == 0 ) --expDiff;
4852 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4853 zExp = aExp;
4854 }
4855 else if ( expDiff < 0 ) {
4856 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4857 if ((uint64_t)(bSig << 1)) {
4858 return propagateFloatx80NaN(a, b, status);
4859 }
0f605c88
LV
4860 return packFloatx80(zSign,
4861 floatx80_infinity_high,
4862 floatx80_infinity_low);
158142c2
FB
4863 }
4864 if ( aExp == 0 ) ++expDiff;
4865 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4866 zExp = bExp;
4867 }
4868 else {
4869 if ( aExp == 0x7FFF ) {
bb98fe42 4870 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 4871 return propagateFloatx80NaN(a, b, status);
158142c2
FB
4872 }
4873 return a;
4874 }
4875 zSig1 = 0;
4876 zSig0 = aSig + bSig;
4877 if ( aExp == 0 ) {
4878 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4879 goto roundAndPack;
4880 }
4881 zExp = aExp;
4882 goto shiftRight1;
4883 }
4884 zSig0 = aSig + bSig;
bb98fe42 4885 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
4886 shiftRight1:
4887 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4888 zSig0 |= LIT64( 0x8000000000000000 );
4889 ++zExp;
4890 roundAndPack:
a2f2d288 4891 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 4892 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
4893}
4894
4895/*----------------------------------------------------------------------------
4896| Returns the result of subtracting the absolute values of the extended
4897| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4898| difference is negated before being returned. `zSign' is ignored if the
4899| result is a NaN. The subtraction is performed according to the IEC/IEEE
4900| Standard for Binary Floating-Point Arithmetic.
4901*----------------------------------------------------------------------------*/
4902
e5a41ffa
PM
4903static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4904 float_status *status)
158142c2 4905{
f4014512 4906 int32_t aExp, bExp, zExp;
bb98fe42 4907 uint64_t aSig, bSig, zSig0, zSig1;
f4014512 4908 int32_t expDiff;
158142c2
FB
4909
4910 aSig = extractFloatx80Frac( a );
4911 aExp = extractFloatx80Exp( a );
4912 bSig = extractFloatx80Frac( b );
4913 bExp = extractFloatx80Exp( b );
4914 expDiff = aExp - bExp;
4915 if ( 0 < expDiff ) goto aExpBigger;
4916 if ( expDiff < 0 ) goto bExpBigger;
4917 if ( aExp == 0x7FFF ) {
bb98fe42 4918 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
ff32e16e 4919 return propagateFloatx80NaN(a, b, status);
158142c2 4920 }
ff32e16e 4921 float_raise(float_flag_invalid, status);
af39bc8c 4922 return floatx80_default_nan(status);
158142c2
FB
4923 }
4924 if ( aExp == 0 ) {
4925 aExp = 1;
4926 bExp = 1;
4927 }
4928 zSig1 = 0;
4929 if ( bSig < aSig ) goto aBigger;
4930 if ( aSig < bSig ) goto bBigger;
a2f2d288 4931 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
158142c2
FB
4932 bExpBigger:
4933 if ( bExp == 0x7FFF ) {
ff32e16e
PM
4934 if ((uint64_t)(bSig << 1)) {
4935 return propagateFloatx80NaN(a, b, status);
4936 }
0f605c88
LV
4937 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4938 floatx80_infinity_low);
158142c2
FB
4939 }
4940 if ( aExp == 0 ) ++expDiff;
4941 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4942 bBigger:
4943 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4944 zExp = bExp;
4945 zSign ^= 1;
4946 goto normalizeRoundAndPack;
4947 aExpBigger:
4948 if ( aExp == 0x7FFF ) {
ff32e16e
PM
4949 if ((uint64_t)(aSig << 1)) {
4950 return propagateFloatx80NaN(a, b, status);
4951 }
158142c2
FB
4952 return a;
4953 }
4954 if ( bExp == 0 ) --expDiff;
4955 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4956 aBigger:
4957 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4958 zExp = aExp;
4959 normalizeRoundAndPack:
a2f2d288 4960 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 4961 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
4962}
4963
4964/*----------------------------------------------------------------------------
4965| Returns the result of adding the extended double-precision floating-point
4966| values `a' and `b'. The operation is performed according to the IEC/IEEE
4967| Standard for Binary Floating-Point Arithmetic.
4968*----------------------------------------------------------------------------*/
4969
e5a41ffa 4970floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
4971{
4972 flag aSign, bSign;
4973
d1eb8f2a
AD
4974 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4975 float_raise(float_flag_invalid, status);
4976 return floatx80_default_nan(status);
4977 }
158142c2
FB
4978 aSign = extractFloatx80Sign( a );
4979 bSign = extractFloatx80Sign( b );
4980 if ( aSign == bSign ) {
ff32e16e 4981 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
4982 }
4983 else {
ff32e16e 4984 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
4985 }
4986
4987}
4988
4989/*----------------------------------------------------------------------------
4990| Returns the result of subtracting the extended double-precision floating-
4991| point values `a' and `b'. The operation is performed according to the
4992| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4993*----------------------------------------------------------------------------*/
4994
e5a41ffa 4995floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
4996{
4997 flag aSign, bSign;
4998
d1eb8f2a
AD
4999 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5000 float_raise(float_flag_invalid, status);
5001 return floatx80_default_nan(status);
5002 }
158142c2
FB
5003 aSign = extractFloatx80Sign( a );
5004 bSign = extractFloatx80Sign( b );
5005 if ( aSign == bSign ) {
ff32e16e 5006 return subFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5007 }
5008 else {
ff32e16e 5009 return addFloatx80Sigs(a, b, aSign, status);
158142c2
FB
5010 }
5011
5012}
5013
5014/*----------------------------------------------------------------------------
5015| Returns the result of multiplying the extended double-precision floating-
5016| point values `a' and `b'. The operation is performed according to the
5017| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5018*----------------------------------------------------------------------------*/
5019
e5a41ffa 5020floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5021{
5022 flag aSign, bSign, zSign;
f4014512 5023 int32_t aExp, bExp, zExp;
bb98fe42 5024 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 5025
d1eb8f2a
AD
5026 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5027 float_raise(float_flag_invalid, status);
5028 return floatx80_default_nan(status);
5029 }
158142c2
FB
5030 aSig = extractFloatx80Frac( a );
5031 aExp = extractFloatx80Exp( a );
5032 aSign = extractFloatx80Sign( a );
5033 bSig = extractFloatx80Frac( b );
5034 bExp = extractFloatx80Exp( b );
5035 bSign = extractFloatx80Sign( b );
5036 zSign = aSign ^ bSign;
5037 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5038 if ( (uint64_t) ( aSig<<1 )
5039 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5040 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5041 }
5042 if ( ( bExp | bSig ) == 0 ) goto invalid;
0f605c88
LV
5043 return packFloatx80(zSign, floatx80_infinity_high,
5044 floatx80_infinity_low);
158142c2
FB
5045 }
5046 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5047 if ((uint64_t)(bSig << 1)) {
5048 return propagateFloatx80NaN(a, b, status);
5049 }
158142c2
FB
5050 if ( ( aExp | aSig ) == 0 ) {
5051 invalid:
ff32e16e 5052 float_raise(float_flag_invalid, status);
af39bc8c 5053 return floatx80_default_nan(status);
158142c2 5054 }
0f605c88
LV
5055 return packFloatx80(zSign, floatx80_infinity_high,
5056 floatx80_infinity_low);
158142c2
FB
5057 }
5058 if ( aExp == 0 ) {
5059 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5060 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5061 }
5062 if ( bExp == 0 ) {
5063 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5064 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5065 }
5066 zExp = aExp + bExp - 0x3FFE;
5067 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5068 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5069 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5070 --zExp;
5071 }
a2f2d288 5072 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5073 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5074}
5075
5076/*----------------------------------------------------------------------------
5077| Returns the result of dividing the extended double-precision floating-point
5078| value `a' by the corresponding value `b'. The operation is performed
5079| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5080*----------------------------------------------------------------------------*/
5081
e5a41ffa 5082floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5083{
5084 flag aSign, bSign, zSign;
f4014512 5085 int32_t aExp, bExp, zExp;
bb98fe42
AF
5086 uint64_t aSig, bSig, zSig0, zSig1;
5087 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2 5088
d1eb8f2a
AD
5089 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5090 float_raise(float_flag_invalid, status);
5091 return floatx80_default_nan(status);
5092 }
158142c2
FB
5093 aSig = extractFloatx80Frac( a );
5094 aExp = extractFloatx80Exp( a );
5095 aSign = extractFloatx80Sign( a );
5096 bSig = extractFloatx80Frac( b );
5097 bExp = extractFloatx80Exp( b );
5098 bSign = extractFloatx80Sign( b );
5099 zSign = aSign ^ bSign;
5100 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5101 if ((uint64_t)(aSig << 1)) {
5102 return propagateFloatx80NaN(a, b, status);
5103 }
158142c2 5104 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5105 if ((uint64_t)(bSig << 1)) {
5106 return propagateFloatx80NaN(a, b, status);
5107 }
158142c2
FB
5108 goto invalid;
5109 }
0f605c88
LV
5110 return packFloatx80(zSign, floatx80_infinity_high,
5111 floatx80_infinity_low);
158142c2
FB
5112 }
5113 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5114 if ((uint64_t)(bSig << 1)) {
5115 return propagateFloatx80NaN(a, b, status);
5116 }
158142c2
FB
5117 return packFloatx80( zSign, 0, 0 );
5118 }
5119 if ( bExp == 0 ) {
5120 if ( bSig == 0 ) {
5121 if ( ( aExp | aSig ) == 0 ) {
5122 invalid:
ff32e16e 5123 float_raise(float_flag_invalid, status);
af39bc8c 5124 return floatx80_default_nan(status);
158142c2 5125 }
ff32e16e 5126 float_raise(float_flag_divbyzero, status);
0f605c88
LV
5127 return packFloatx80(zSign, floatx80_infinity_high,
5128 floatx80_infinity_low);
158142c2
FB
5129 }
5130 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5131 }
5132 if ( aExp == 0 ) {
5133 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5134 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5135 }
5136 zExp = aExp - bExp + 0x3FFE;
5137 rem1 = 0;
5138 if ( bSig <= aSig ) {
5139 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5140 ++zExp;
5141 }
5142 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5143 mul64To128( bSig, zSig0, &term0, &term1 );
5144 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5145 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5146 --zSig0;
5147 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5148 }
5149 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5150 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5151 mul64To128( bSig, zSig1, &term1, &term2 );
5152 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5153 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5154 --zSig1;
5155 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5156 }
5157 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5158 }
a2f2d288 5159 return roundAndPackFloatx80(status->floatx80_rounding_precision,
ff32e16e 5160 zSign, zExp, zSig0, zSig1, status);
158142c2
FB
5161}
5162
5163/*----------------------------------------------------------------------------
5164| Returns the remainder of the extended double-precision floating-point value
5165| `a' with respect to the corresponding value `b'. The operation is performed
5166| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5167*----------------------------------------------------------------------------*/
5168
e5a41ffa 5169floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
158142c2 5170{
ed086f3d 5171 flag aSign, zSign;
f4014512 5172 int32_t aExp, bExp, expDiff;
bb98fe42
AF
5173 uint64_t aSig0, aSig1, bSig;
5174 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2 5175
d1eb8f2a
AD
5176 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5177 float_raise(float_flag_invalid, status);
5178 return floatx80_default_nan(status);
5179 }
158142c2
FB
5180 aSig0 = extractFloatx80Frac( a );
5181 aExp = extractFloatx80Exp( a );
5182 aSign = extractFloatx80Sign( a );
5183 bSig = extractFloatx80Frac( b );
5184 bExp = extractFloatx80Exp( b );
158142c2 5185 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5186 if ( (uint64_t) ( aSig0<<1 )
5187 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
ff32e16e 5188 return propagateFloatx80NaN(a, b, status);
158142c2
FB
5189 }
5190 goto invalid;
5191 }
5192 if ( bExp == 0x7FFF ) {
ff32e16e
PM
5193 if ((uint64_t)(bSig << 1)) {
5194 return propagateFloatx80NaN(a, b, status);
5195 }
158142c2
FB
5196 return a;
5197 }
5198 if ( bExp == 0 ) {
5199 if ( bSig == 0 ) {
5200 invalid:
ff32e16e 5201 float_raise(float_flag_invalid, status);
af39bc8c 5202 return floatx80_default_nan(status);
158142c2
FB
5203 }
5204 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5205 }
5206 if ( aExp == 0 ) {
bb98fe42 5207 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5208 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5209 }
5210 bSig |= LIT64( 0x8000000000000000 );
5211 zSign = aSign;
5212 expDiff = aExp - bExp;
5213 aSig1 = 0;
5214 if ( expDiff < 0 ) {
5215 if ( expDiff < -1 ) return a;
5216 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5217 expDiff = 0;
5218 }
5219 q = ( bSig <= aSig0 );
5220 if ( q ) aSig0 -= bSig;
5221 expDiff -= 64;
5222 while ( 0 < expDiff ) {
5223 q = estimateDiv128To64( aSig0, aSig1, bSig );
5224 q = ( 2 < q ) ? q - 2 : 0;
5225 mul64To128( bSig, q, &term0, &term1 );
5226 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5227 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5228 expDiff -= 62;
5229 }
5230 expDiff += 64;
5231 if ( 0 < expDiff ) {
5232 q = estimateDiv128To64( aSig0, aSig1, bSig );
5233 q = ( 2 < q ) ? q - 2 : 0;
5234 q >>= 64 - expDiff;
5235 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5236 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5237 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5238 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5239 ++q;
5240 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5241 }
5242 }
5243 else {
5244 term1 = 0;
5245 term0 = bSig;
5246 }
5247 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5248 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5249 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5250 && ( q & 1 ) )
5251 ) {
5252 aSig0 = alternateASig0;
5253 aSig1 = alternateASig1;
5254 zSign = ! zSign;
5255 }
5256 return
5257 normalizeRoundAndPackFloatx80(
ff32e16e 5258 80, zSign, bExp + expDiff, aSig0, aSig1, status);
158142c2
FB
5259
5260}
5261
5262/*----------------------------------------------------------------------------
5263| Returns the square root of the extended double-precision floating-point
5264| value `a'. The operation is performed according to the IEC/IEEE Standard
5265| for Binary Floating-Point Arithmetic.
5266*----------------------------------------------------------------------------*/
5267
e5a41ffa 5268floatx80 floatx80_sqrt(floatx80 a, float_status *status)
158142c2
FB
5269{
5270 flag aSign;
f4014512 5271 int32_t aExp, zExp;
bb98fe42
AF
5272 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5273 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2 5274
d1eb8f2a
AD
5275 if (floatx80_invalid_encoding(a)) {
5276 float_raise(float_flag_invalid, status);
5277 return floatx80_default_nan(status);
5278 }
158142c2
FB
5279 aSig0 = extractFloatx80Frac( a );
5280 aExp = extractFloatx80Exp( a );
5281 aSign = extractFloatx80Sign( a );
5282 if ( aExp == 0x7FFF ) {
ff32e16e
PM
5283 if ((uint64_t)(aSig0 << 1)) {
5284 return propagateFloatx80NaN(a, a, status);
5285 }
158142c2
FB
5286 if ( ! aSign ) return a;
5287 goto invalid;
5288 }
5289 if ( aSign ) {
5290 if ( ( aExp | aSig0 ) == 0 ) return a;
5291 invalid:
ff32e16e 5292 float_raise(float_flag_invalid, status);
af39bc8c 5293 return floatx80_default_nan(status);
158142c2
FB
5294 }
5295 if ( aExp == 0 ) {
5296 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5297 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5298 }
5299 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5300 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5301 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5302 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5303 doubleZSig0 = zSig0<<1;
5304 mul64To128( zSig0, zSig0, &term0, &term1 );
5305 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5306 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5307 --zSig0;
5308 doubleZSig0 -= 2;
5309 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5310 }
5311 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5312 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5313 if ( zSig1 == 0 ) zSig1 = 1;
5314 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5315 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5316 mul64To128( zSig1, zSig1, &term2, &term3 );
5317 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5318 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5319 --zSig1;
5320 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5321 term3 |= 1;
5322 term2 |= doubleZSig0;
5323 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5324 }
5325 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5326 }
5327 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5328 zSig0 |= doubleZSig0;
a2f2d288
PM
5329 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5330 0, zExp, zSig0, zSig1, status);
158142c2
FB
5331}
5332
5333/*----------------------------------------------------------------------------
b689362d
AJ
5334| Returns 1 if the extended double-precision floating-point value `a' is equal
5335| to the corresponding value `b', and 0 otherwise. The invalid exception is
5336| raised if either operand is a NaN. Otherwise, the comparison is performed
5337| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5338*----------------------------------------------------------------------------*/
5339
e5a41ffa 5340int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5341{
5342
d1eb8f2a
AD
5343 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5344 || (extractFloatx80Exp(a) == 0x7FFF
5345 && (uint64_t) (extractFloatx80Frac(a) << 1))
5346 || (extractFloatx80Exp(b) == 0x7FFF
5347 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5348 ) {
ff32e16e 5349 float_raise(float_flag_invalid, status);
158142c2
FB
5350 return 0;
5351 }
5352 return
5353 ( a.low == b.low )
5354 && ( ( a.high == b.high )
5355 || ( ( a.low == 0 )
bb98fe42 5356 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5357 );
5358
5359}
5360
5361/*----------------------------------------------------------------------------
5362| Returns 1 if the extended double-precision floating-point value `a' is
5363| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5364| invalid exception is raised if either operand is a NaN. The comparison is
5365| performed according to the IEC/IEEE Standard for Binary Floating-Point
5366| Arithmetic.
158142c2
FB
5367*----------------------------------------------------------------------------*/
5368
e5a41ffa 5369int floatx80_le(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5370{
5371 flag aSign, bSign;
5372
d1eb8f2a
AD
5373 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5374 || (extractFloatx80Exp(a) == 0x7FFF
5375 && (uint64_t) (extractFloatx80Frac(a) << 1))
5376 || (extractFloatx80Exp(b) == 0x7FFF
5377 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5378 ) {
ff32e16e 5379 float_raise(float_flag_invalid, status);
158142c2
FB
5380 return 0;
5381 }
5382 aSign = extractFloatx80Sign( a );
5383 bSign = extractFloatx80Sign( b );
5384 if ( aSign != bSign ) {
5385 return
5386 aSign
bb98fe42 5387 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5388 == 0 );
5389 }
5390 return
5391 aSign ? le128( b.high, b.low, a.high, a.low )
5392 : le128( a.high, a.low, b.high, b.low );
5393
5394}
5395
5396/*----------------------------------------------------------------------------
5397| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5398| less than the corresponding value `b', and 0 otherwise. The invalid
5399| exception is raised if either operand is a NaN. The comparison is performed
5400| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5401*----------------------------------------------------------------------------*/
5402
e5a41ffa 5403int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5404{
5405 flag aSign, bSign;
5406
d1eb8f2a
AD
5407 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5408 || (extractFloatx80Exp(a) == 0x7FFF
5409 && (uint64_t) (extractFloatx80Frac(a) << 1))
5410 || (extractFloatx80Exp(b) == 0x7FFF
5411 && (uint64_t) (extractFloatx80Frac(b) << 1))
158142c2 5412 ) {
ff32e16e 5413 float_raise(float_flag_invalid, status);
158142c2
FB
5414 return 0;
5415 }
5416 aSign = extractFloatx80Sign( a );
5417 bSign = extractFloatx80Sign( b );
5418 if ( aSign != bSign ) {
5419 return
5420 aSign
bb98fe42 5421 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5422 != 0 );
5423 }
5424 return
5425 aSign ? lt128( b.high, b.low, a.high, a.low )
5426 : lt128( a.high, a.low, b.high, b.low );
5427
5428}
5429
67b7861d
AJ
5430/*----------------------------------------------------------------------------
5431| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5432| cannot be compared, and 0 otherwise. The invalid exception is raised if
5433| either operand is a NaN. The comparison is performed according to the
5434| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d 5435*----------------------------------------------------------------------------*/
e5a41ffa 5436int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
67b7861d 5437{
d1eb8f2a
AD
5438 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5439 || (extractFloatx80Exp(a) == 0x7FFF
5440 && (uint64_t) (extractFloatx80Frac(a) << 1))
5441 || (extractFloatx80Exp(b) == 0x7FFF
5442 && (uint64_t) (extractFloatx80Frac(b) << 1))
67b7861d 5443 ) {
ff32e16e 5444 float_raise(float_flag_invalid, status);
67b7861d
AJ
5445 return 1;
5446 }
5447 return 0;
5448}
5449
158142c2 5450/*----------------------------------------------------------------------------
b689362d 5451| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5452| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5453| cause an exception. The comparison is performed according to the IEC/IEEE
5454| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5455*----------------------------------------------------------------------------*/
5456
e5a41ffa 5457int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5458{
5459
d1eb8f2a
AD
5460 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5461 float_raise(float_flag_invalid, status);
5462 return 0;
5463 }
158142c2 5464 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5465 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5466 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5467 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5468 ) {
af39bc8c
AM
5469 if (floatx80_is_signaling_nan(a, status)
5470 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5471 float_raise(float_flag_invalid, status);
b689362d 5472 }
158142c2
FB
5473 return 0;
5474 }
5475 return
5476 ( a.low == b.low )
5477 && ( ( a.high == b.high )
5478 || ( ( a.low == 0 )
bb98fe42 5479 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5480 );
5481
5482}
5483
5484/*----------------------------------------------------------------------------
5485| Returns 1 if the extended double-precision floating-point value `a' is less
5486| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5487| do not cause an exception. Otherwise, the comparison is performed according
5488| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5489*----------------------------------------------------------------------------*/
5490
e5a41ffa 5491int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5492{
5493 flag aSign, bSign;
5494
d1eb8f2a
AD
5495 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5496 float_raise(float_flag_invalid, status);
5497 return 0;
5498 }
158142c2 5499 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5500 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5501 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5502 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5503 ) {
af39bc8c
AM
5504 if (floatx80_is_signaling_nan(a, status)
5505 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5506 float_raise(float_flag_invalid, status);
158142c2
FB
5507 }
5508 return 0;
5509 }
5510 aSign = extractFloatx80Sign( a );
5511 bSign = extractFloatx80Sign( b );
5512 if ( aSign != bSign ) {
5513 return
5514 aSign
bb98fe42 5515 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5516 == 0 );
5517 }
5518 return
5519 aSign ? le128( b.high, b.low, a.high, a.low )
5520 : le128( a.high, a.low, b.high, b.low );
5521
5522}
5523
5524/*----------------------------------------------------------------------------
5525| Returns 1 if the extended double-precision floating-point value `a' is less
5526| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5527| an exception. Otherwise, the comparison is performed according to the
5528| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5529*----------------------------------------------------------------------------*/
5530
e5a41ffa 5531int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
158142c2
FB
5532{
5533 flag aSign, bSign;
5534
d1eb8f2a
AD
5535 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5536 float_raise(float_flag_invalid, status);
5537 return 0;
5538 }
158142c2 5539 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5540 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5541 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5542 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5543 ) {
af39bc8c
AM
5544 if (floatx80_is_signaling_nan(a, status)
5545 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5546 float_raise(float_flag_invalid, status);
158142c2
FB
5547 }
5548 return 0;
5549 }
5550 aSign = extractFloatx80Sign( a );
5551 bSign = extractFloatx80Sign( b );
5552 if ( aSign != bSign ) {
5553 return
5554 aSign
bb98fe42 5555 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5556 != 0 );
5557 }
5558 return
5559 aSign ? lt128( b.high, b.low, a.high, a.low )
5560 : lt128( a.high, a.low, b.high, b.low );
5561
5562}
5563
67b7861d
AJ
5564/*----------------------------------------------------------------------------
5565| Returns 1 if the extended double-precision floating-point values `a' and `b'
5566| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5567| The comparison is performed according to the IEC/IEEE Standard for Binary
5568| Floating-Point Arithmetic.
5569*----------------------------------------------------------------------------*/
e5a41ffa 5570int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
67b7861d 5571{
d1eb8f2a
AD
5572 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5573 float_raise(float_flag_invalid, status);
5574 return 1;
5575 }
67b7861d
AJ
5576 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5577 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5578 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5579 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5580 ) {
af39bc8c
AM
5581 if (floatx80_is_signaling_nan(a, status)
5582 || floatx80_is_signaling_nan(b, status)) {
ff32e16e 5583 float_raise(float_flag_invalid, status);
67b7861d
AJ
5584 }
5585 return 1;
5586 }
5587 return 0;
5588}
5589
158142c2
FB
5590/*----------------------------------------------------------------------------
5591| Returns the result of converting the quadruple-precision floating-point
5592| value `a' to the 32-bit two's complement integer format. The conversion
5593| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5594| Arithmetic---which means in particular that the conversion is rounded
5595| according to the current rounding mode. If `a' is a NaN, the largest
5596| positive integer is returned. Otherwise, if the conversion overflows, the
5597| largest integer with the same sign as `a' is returned.
5598*----------------------------------------------------------------------------*/
5599
f4014512 5600int32_t float128_to_int32(float128 a, float_status *status)
158142c2
FB
5601{
5602 flag aSign;
f4014512 5603 int32_t aExp, shiftCount;
bb98fe42 5604 uint64_t aSig0, aSig1;
158142c2
FB
5605
5606 aSig1 = extractFloat128Frac1( a );
5607 aSig0 = extractFloat128Frac0( a );
5608 aExp = extractFloat128Exp( a );
5609 aSign = extractFloat128Sign( a );
5610 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5611 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5612 aSig0 |= ( aSig1 != 0 );
5613 shiftCount = 0x4028 - aExp;
5614 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
ff32e16e 5615 return roundAndPackInt32(aSign, aSig0, status);
158142c2
FB
5616
5617}
5618
5619/*----------------------------------------------------------------------------
5620| Returns the result of converting the quadruple-precision floating-point
5621| value `a' to the 32-bit two's complement integer format. The conversion
5622| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5623| Arithmetic, except that the conversion is always rounded toward zero. If
5624| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5625| conversion overflows, the largest integer with the same sign as `a' is
5626| returned.
5627*----------------------------------------------------------------------------*/
5628
f4014512 5629int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
158142c2
FB
5630{
5631 flag aSign;
f4014512 5632 int32_t aExp, shiftCount;
bb98fe42 5633 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5634 int32_t z;
158142c2
FB
5635
5636 aSig1 = extractFloat128Frac1( a );
5637 aSig0 = extractFloat128Frac0( a );
5638 aExp = extractFloat128Exp( a );
5639 aSign = extractFloat128Sign( a );
5640 aSig0 |= ( aSig1 != 0 );
5641 if ( 0x401E < aExp ) {
5642 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5643 goto invalid;
5644 }
5645 else if ( aExp < 0x3FFF ) {
a2f2d288
PM
5646 if (aExp || aSig0) {
5647 status->float_exception_flags |= float_flag_inexact;
5648 }
158142c2
FB
5649 return 0;
5650 }
5651 aSig0 |= LIT64( 0x0001000000000000 );
5652 shiftCount = 0x402F - aExp;
5653 savedASig = aSig0;
5654 aSig0 >>= shiftCount;
5655 z = aSig0;
5656 if ( aSign ) z = - z;
5657 if ( ( z < 0 ) ^ aSign ) {
5658 invalid:
ff32e16e 5659 float_raise(float_flag_invalid, status);
bb98fe42 5660 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5661 }
5662 if ( ( aSig0<<shiftCount ) != savedASig ) {
a2f2d288 5663 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5664 }
5665 return z;
5666
5667}
5668
5669/*----------------------------------------------------------------------------
5670| Returns the result of converting the quadruple-precision floating-point
5671| value `a' to the 64-bit two's complement integer format. The conversion
5672| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5673| Arithmetic---which means in particular that the conversion is rounded
5674| according to the current rounding mode. If `a' is a NaN, the largest
5675| positive integer is returned. Otherwise, if the conversion overflows, the
5676| largest integer with the same sign as `a' is returned.
5677*----------------------------------------------------------------------------*/
5678
f42c2224 5679int64_t float128_to_int64(float128 a, float_status *status)
158142c2
FB
5680{
5681 flag aSign;
f4014512 5682 int32_t aExp, shiftCount;
bb98fe42 5683 uint64_t aSig0, aSig1;
158142c2
FB
5684
5685 aSig1 = extractFloat128Frac1( a );
5686 aSig0 = extractFloat128Frac0( a );
5687 aExp = extractFloat128Exp( a );
5688 aSign = extractFloat128Sign( a );
5689 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5690 shiftCount = 0x402F - aExp;
5691 if ( shiftCount <= 0 ) {
5692 if ( 0x403E < aExp ) {
ff32e16e 5693 float_raise(float_flag_invalid, status);
158142c2
FB
5694 if ( ! aSign
5695 || ( ( aExp == 0x7FFF )
5696 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5697 )
5698 ) {
5699 return LIT64( 0x7FFFFFFFFFFFFFFF );
5700 }
bb98fe42 5701 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5702 }
5703 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5704 }
5705 else {
5706 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5707 }
ff32e16e 5708 return roundAndPackInt64(aSign, aSig0, aSig1, status);
158142c2
FB
5709
5710}
5711
5712/*----------------------------------------------------------------------------
5713| Returns the result of converting the quadruple-precision floating-point
5714| value `a' to the 64-bit two's complement integer format. The conversion
5715| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5716| Arithmetic, except that the conversion is always rounded toward zero.
5717| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5718| the conversion overflows, the largest integer with the same sign as `a' is
5719| returned.
5720*----------------------------------------------------------------------------*/
5721
f42c2224 5722int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
158142c2
FB
5723{
5724 flag aSign;
f4014512 5725 int32_t aExp, shiftCount;
bb98fe42 5726 uint64_t aSig0, aSig1;
f42c2224 5727 int64_t z;
158142c2
FB
5728
5729 aSig1 = extractFloat128Frac1( a );
5730 aSig0 = extractFloat128Frac0( a );
5731 aExp = extractFloat128Exp( a );
5732 aSign = extractFloat128Sign( a );
5733 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5734 shiftCount = aExp - 0x402F;
5735 if ( 0 < shiftCount ) {
5736 if ( 0x403E <= aExp ) {
5737 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5738 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5739 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
a2f2d288
PM
5740 if (aSig1) {
5741 status->float_exception_flags |= float_flag_inexact;
5742 }
158142c2
FB
5743 }
5744 else {
ff32e16e 5745 float_raise(float_flag_invalid, status);
158142c2
FB
5746 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5747 return LIT64( 0x7FFFFFFFFFFFFFFF );
5748 }
5749 }
bb98fe42 5750 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5751 }
5752 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5753 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
a2f2d288 5754 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5755 }
5756 }
5757 else {
5758 if ( aExp < 0x3FFF ) {
5759 if ( aExp | aSig0 | aSig1 ) {
a2f2d288 5760 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5761 }
5762 return 0;
5763 }
5764 z = aSig0>>( - shiftCount );
5765 if ( aSig1
bb98fe42 5766 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
a2f2d288 5767 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
5768 }
5769 }
5770 if ( aSign ) z = - z;
5771 return z;
5772
5773}
5774
2e6d8568
BR
5775/*----------------------------------------------------------------------------
5776| Returns the result of converting the quadruple-precision floating-point value
5777| `a' to the 64-bit unsigned integer format. The conversion is
5778| performed according to the IEC/IEEE Standard for Binary Floating-Point
5779| Arithmetic---which means in particular that the conversion is rounded
5780| according to the current rounding mode. If `a' is a NaN, the largest
5781| positive integer is returned. If the conversion overflows, the
5782| largest unsigned integer is returned. If 'a' is negative, the value is
5783| rounded and zero is returned; negative values that do not round to zero
5784| will raise the inexact exception.
5785*----------------------------------------------------------------------------*/
5786
5787uint64_t float128_to_uint64(float128 a, float_status *status)
5788{
5789 flag aSign;
5790 int aExp;
5791 int shiftCount;
5792 uint64_t aSig0, aSig1;
5793
5794 aSig0 = extractFloat128Frac0(a);
5795 aSig1 = extractFloat128Frac1(a);
5796 aExp = extractFloat128Exp(a);
5797 aSign = extractFloat128Sign(a);
5798 if (aSign && (aExp > 0x3FFE)) {
5799 float_raise(float_flag_invalid, status);
5800 if (float128_is_any_nan(a)) {
5801 return LIT64(0xFFFFFFFFFFFFFFFF);
5802 } else {
5803 return 0;
5804 }
5805 }
5806 if (aExp) {
5807 aSig0 |= LIT64(0x0001000000000000);
5808 }
5809 shiftCount = 0x402F - aExp;
5810 if (shiftCount <= 0) {
5811 if (0x403E < aExp) {
5812 float_raise(float_flag_invalid, status);
5813 return LIT64(0xFFFFFFFFFFFFFFFF);
5814 }
5815 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5816 } else {
5817 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5818 }
5819 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5820}
5821
5822uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5823{
5824 uint64_t v;
5825 signed char current_rounding_mode = status->float_rounding_mode;
5826
5827 set_float_rounding_mode(float_round_to_zero, status);
5828 v = float128_to_uint64(a, status);
5829 set_float_rounding_mode(current_rounding_mode, status);
5830
5831 return v;
5832}
5833
158142c2
FB
5834/*----------------------------------------------------------------------------
5835| Returns the result of converting the quadruple-precision floating-point
fd425037
BR
5836| value `a' to the 32-bit unsigned integer format. The conversion
5837| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5838| Arithmetic except that the conversion is always rounded toward zero.
5839| If `a' is a NaN, the largest positive integer is returned. Otherwise,
5840| if the conversion overflows, the largest unsigned integer is returned.
5841| If 'a' is negative, the value is rounded and zero is returned; negative
5842| values that do not round to zero will raise the inexact exception.
5843*----------------------------------------------------------------------------*/
5844
5845uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5846{
5847 uint64_t v;
5848 uint32_t res;
5849 int old_exc_flags = get_float_exception_flags(status);
5850
5851 v = float128_to_uint64_round_to_zero(a, status);
5852 if (v > 0xffffffff) {
5853 res = 0xffffffff;
5854 } else {
5855 return v;
5856 }
5857 set_float_exception_flags(old_exc_flags, status);
5858 float_raise(float_flag_invalid, status);
5859 return res;
5860}
5861
5862/*----------------------------------------------------------------------------
5863| Returns the result of converting the quadruple-precision floating-point
158142c2
FB
5864| value `a' to the single-precision floating-point format. The conversion
5865| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5866| Arithmetic.
5867*----------------------------------------------------------------------------*/
5868
e5a41ffa 5869float32 float128_to_float32(float128 a, float_status *status)
158142c2
FB
5870{
5871 flag aSign;
f4014512 5872 int32_t aExp;
bb98fe42
AF
5873 uint64_t aSig0, aSig1;
5874 uint32_t zSig;
158142c2
FB
5875
5876 aSig1 = extractFloat128Frac1( a );
5877 aSig0 = extractFloat128Frac0( a );
5878 aExp = extractFloat128Exp( a );
5879 aSign = extractFloat128Sign( a );
5880 if ( aExp == 0x7FFF ) {
5881 if ( aSig0 | aSig1 ) {
ff32e16e 5882 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
158142c2
FB
5883 }
5884 return packFloat32( aSign, 0xFF, 0 );
5885 }
5886 aSig0 |= ( aSig1 != 0 );
5887 shift64RightJamming( aSig0, 18, &aSig0 );
5888 zSig = aSig0;
5889 if ( aExp || zSig ) {
5890 zSig |= 0x40000000;
5891 aExp -= 0x3F81;
5892 }
ff32e16e 5893 return roundAndPackFloat32(aSign, aExp, zSig, status);
158142c2
FB
5894
5895}
5896
5897/*----------------------------------------------------------------------------
5898| Returns the result of converting the quadruple-precision floating-point
5899| value `a' to the double-precision floating-point format. The conversion
5900| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5901| Arithmetic.
5902*----------------------------------------------------------------------------*/
5903
e5a41ffa 5904float64 float128_to_float64(float128 a, float_status *status)
158142c2
FB
5905{
5906 flag aSign;
f4014512 5907 int32_t aExp;
bb98fe42 5908 uint64_t aSig0, aSig1;
158142c2
FB
5909
5910 aSig1 = extractFloat128Frac1( a );
5911 aSig0 = extractFloat128Frac0( a );
5912 aExp = extractFloat128Exp( a );
5913 aSign = extractFloat128Sign( a );
5914 if ( aExp == 0x7FFF ) {
5915 if ( aSig0 | aSig1 ) {
ff32e16e 5916 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
158142c2
FB
5917 }
5918 return packFloat64( aSign, 0x7FF, 0 );
5919 }
5920 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5921 aSig0 |= ( aSig1 != 0 );
5922 if ( aExp || aSig0 ) {
5923 aSig0 |= LIT64( 0x4000000000000000 );
5924 aExp -= 0x3C01;
5925 }
ff32e16e 5926 return roundAndPackFloat64(aSign, aExp, aSig0, status);
158142c2
FB
5927
5928}
5929
158142c2
FB
5930/*----------------------------------------------------------------------------
5931| Returns the result of converting the quadruple-precision floating-point
5932| value `a' to the extended double-precision floating-point format. The
5933| conversion is performed according to the IEC/IEEE Standard for Binary
5934| Floating-Point Arithmetic.
5935*----------------------------------------------------------------------------*/
5936
e5a41ffa 5937floatx80 float128_to_floatx80(float128 a, float_status *status)
158142c2
FB
5938{
5939 flag aSign;
f4014512 5940 int32_t aExp;
bb98fe42 5941 uint64_t aSig0, aSig1;
158142c2
FB
5942
5943 aSig1 = extractFloat128Frac1( a );
5944 aSig0 = extractFloat128Frac0( a );
5945 aExp = extractFloat128Exp( a );
5946 aSign = extractFloat128Sign( a );
5947 if ( aExp == 0x7FFF ) {
5948 if ( aSig0 | aSig1 ) {
ff32e16e 5949 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
158142c2 5950 }
0f605c88
LV
5951 return packFloatx80(aSign, floatx80_infinity_high,
5952 floatx80_infinity_low);
158142c2
FB
5953 }
5954 if ( aExp == 0 ) {
5955 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5956 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5957 }
5958 else {
5959 aSig0 |= LIT64( 0x0001000000000000 );
5960 }
5961 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
ff32e16e 5962 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
158142c2
FB
5963
5964}
5965
158142c2
FB
5966/*----------------------------------------------------------------------------
5967| Rounds the quadruple-precision floating-point value `a' to an integer, and
5968| returns the result as a quadruple-precision floating-point value. The
5969| operation is performed according to the IEC/IEEE Standard for Binary
5970| Floating-Point Arithmetic.
5971*----------------------------------------------------------------------------*/
5972
e5a41ffa 5973float128 float128_round_to_int(float128 a, float_status *status)
158142c2
FB
5974{
5975 flag aSign;
f4014512 5976 int32_t aExp;
bb98fe42 5977 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5978 float128 z;
5979
5980 aExp = extractFloat128Exp( a );
5981 if ( 0x402F <= aExp ) {
5982 if ( 0x406F <= aExp ) {
5983 if ( ( aExp == 0x7FFF )
5984 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5985 ) {
ff32e16e 5986 return propagateFloat128NaN(a, a, status);
158142c2
FB
5987 }
5988 return a;
5989 }
5990 lastBitMask = 1;
5991 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5992 roundBitsMask = lastBitMask - 1;
5993 z = a;
a2f2d288 5994 switch (status->float_rounding_mode) {
dc355b76 5995 case float_round_nearest_even:
158142c2
FB
5996 if ( lastBitMask ) {
5997 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5998 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5999 }
6000 else {
bb98fe42 6001 if ( (int64_t) z.low < 0 ) {
158142c2 6002 ++z.high;
bb98fe42 6003 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6004 }
6005 }
dc355b76 6006 break;
f9288a76
PM
6007 case float_round_ties_away:
6008 if (lastBitMask) {
6009 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6010 } else {
6011 if ((int64_t) z.low < 0) {
6012 ++z.high;
6013 }
6014 }
6015 break;
dc355b76
PM
6016 case float_round_to_zero:
6017 break;
6018 case float_round_up:
6019 if (!extractFloat128Sign(z)) {
6020 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6021 }
6022 break;
6023 case float_round_down:
6024 if (extractFloat128Sign(z)) {
6025 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6026 }
dc355b76
PM
6027 break;
6028 default:
6029 abort();
158142c2
FB
6030 }
6031 z.low &= ~ roundBitsMask;
6032 }
6033 else {
6034 if ( aExp < 0x3FFF ) {
bb98fe42 6035 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
a2f2d288 6036 status->float_exception_flags |= float_flag_inexact;
158142c2 6037 aSign = extractFloat128Sign( a );
a2f2d288 6038 switch (status->float_rounding_mode) {
158142c2
FB
6039 case float_round_nearest_even:
6040 if ( ( aExp == 0x3FFE )
6041 && ( extractFloat128Frac0( a )
6042 | extractFloat128Frac1( a ) )
6043 ) {
6044 return packFloat128( aSign, 0x3FFF, 0, 0 );
6045 }
6046 break;
f9288a76
PM
6047 case float_round_ties_away:
6048 if (aExp == 0x3FFE) {
6049 return packFloat128(aSign, 0x3FFF, 0, 0);
6050 }
6051 break;
158142c2
FB
6052 case float_round_down:
6053 return
6054 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6055 : packFloat128( 0, 0, 0, 0 );
6056 case float_round_up:
6057 return
6058 aSign ? packFloat128( 1, 0, 0, 0 )
6059 : packFloat128( 0, 0x3FFF, 0, 0 );
6060 }
6061 return packFloat128( aSign, 0, 0, 0 );
6062 }
6063 lastBitMask = 1;
6064 lastBitMask <<= 0x402F - aExp;
6065 roundBitsMask = lastBitMask - 1;
6066 z.low = 0;
6067 z.high = a.high;
a2f2d288 6068 switch (status->float_rounding_mode) {
dc355b76 6069 case float_round_nearest_even:
158142c2
FB
6070 z.high += lastBitMask>>1;
6071 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6072 z.high &= ~ lastBitMask;
6073 }
dc355b76 6074 break;
f9288a76
PM
6075 case float_round_ties_away:
6076 z.high += lastBitMask>>1;
6077 break;
dc355b76
PM
6078 case float_round_to_zero:
6079 break;
6080 case float_round_up:
6081 if (!extractFloat128Sign(z)) {
158142c2
FB
6082 z.high |= ( a.low != 0 );
6083 z.high += roundBitsMask;
6084 }
dc355b76
PM
6085 break;
6086 case float_round_down:
6087 if (extractFloat128Sign(z)) {
6088 z.high |= (a.low != 0);
6089 z.high += roundBitsMask;
6090 }
6091 break;
6092 default:
6093 abort();
158142c2
FB
6094 }
6095 z.high &= ~ roundBitsMask;
6096 }
6097 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
a2f2d288 6098 status->float_exception_flags |= float_flag_inexact;
158142c2
FB
6099 }
6100 return z;
6101
6102}
6103
6104/*----------------------------------------------------------------------------
6105| Returns the result of adding the absolute values of the quadruple-precision
6106| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6107| before being returned. `zSign' is ignored if the result is a NaN.
6108| The addition is performed according to the IEC/IEEE Standard for Binary
6109| Floating-Point Arithmetic.
6110*----------------------------------------------------------------------------*/
6111
e5a41ffa
PM
6112static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6113 float_status *status)
158142c2 6114{
f4014512 6115 int32_t aExp, bExp, zExp;
bb98fe42 6116 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
f4014512 6117 int32_t expDiff;
158142c2
FB
6118
6119 aSig1 = extractFloat128Frac1( a );
6120 aSig0 = extractFloat128Frac0( a );
6121 aExp = extractFloat128Exp( a );
6122 bSig1 = extractFloat128Frac1( b );
6123 bSig0 = extractFloat128Frac0( b );
6124 bExp = extractFloat128Exp( b );
6125 expDiff = aExp - bExp;
6126 if ( 0 < expDiff ) {
6127 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6128 if (aSig0 | aSig1) {
6129 return propagateFloat128NaN(a, b, status);
6130 }
158142c2
FB
6131 return a;
6132 }
6133 if ( bExp == 0 ) {
6134 --expDiff;
6135 }
6136 else {
6137 bSig0 |= LIT64( 0x0001000000000000 );
6138 }
6139 shift128ExtraRightJamming(
6140 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6141 zExp = aExp;
6142 }
6143 else if ( expDiff < 0 ) {
6144 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6145 if (bSig0 | bSig1) {
6146 return propagateFloat128NaN(a, b, status);
6147 }
158142c2
FB
6148 return packFloat128( zSign, 0x7FFF, 0, 0 );
6149 }
6150 if ( aExp == 0 ) {
6151 ++expDiff;
6152 }
6153 else {
6154 aSig0 |= LIT64( 0x0001000000000000 );
6155 }
6156 shift128ExtraRightJamming(
6157 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6158 zExp = bExp;
6159 }
6160 else {
6161 if ( aExp == 0x7FFF ) {
6162 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6163 return propagateFloat128NaN(a, b, status);
158142c2
FB
6164 }
6165 return a;
6166 }
6167 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6168 if ( aExp == 0 ) {
a2f2d288 6169 if (status->flush_to_zero) {
e6afc87f 6170 if (zSig0 | zSig1) {
ff32e16e 6171 float_raise(float_flag_output_denormal, status);
e6afc87f
PM
6172 }
6173 return packFloat128(zSign, 0, 0, 0);
6174 }
fe76d976
PB
6175 return packFloat128( zSign, 0, zSig0, zSig1 );
6176 }
158142c2
FB
6177 zSig2 = 0;
6178 zSig0 |= LIT64( 0x0002000000000000 );
6179 zExp = aExp;
6180 goto shiftRight1;
6181 }
6182 aSig0 |= LIT64( 0x0001000000000000 );
6183 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6184 --zExp;
6185 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6186 ++zExp;
6187 shiftRight1:
6188 shift128ExtraRightJamming(
6189 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6190 roundAndPack:
ff32e16e 6191 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6192
6193}
6194
6195/*----------------------------------------------------------------------------
6196| Returns the result of subtracting the absolute values of the quadruple-
6197| precision floating-point values `a' and `b'. If `zSign' is 1, the
6198| difference is negated before being returned. `zSign' is ignored if the
6199| result is a NaN. The subtraction is performed according to the IEC/IEEE
6200| Standard for Binary Floating-Point Arithmetic.
6201*----------------------------------------------------------------------------*/
6202
e5a41ffa
PM
6203static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6204 float_status *status)
158142c2 6205{
f4014512 6206 int32_t aExp, bExp, zExp;
bb98fe42 6207 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
f4014512 6208 int32_t expDiff;
158142c2
FB
6209
6210 aSig1 = extractFloat128Frac1( a );
6211 aSig0 = extractFloat128Frac0( a );
6212 aExp = extractFloat128Exp( a );
6213 bSig1 = extractFloat128Frac1( b );
6214 bSig0 = extractFloat128Frac0( b );
6215 bExp = extractFloat128Exp( b );
6216 expDiff = aExp - bExp;
6217 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6218 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6219 if ( 0 < expDiff ) goto aExpBigger;
6220 if ( expDiff < 0 ) goto bExpBigger;
6221 if ( aExp == 0x7FFF ) {
6222 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
ff32e16e 6223 return propagateFloat128NaN(a, b, status);
158142c2 6224 }
ff32e16e 6225 float_raise(float_flag_invalid, status);
af39bc8c 6226 return float128_default_nan(status);
158142c2
FB
6227 }
6228 if ( aExp == 0 ) {
6229 aExp = 1;
6230 bExp = 1;
6231 }
6232 if ( bSig0 < aSig0 ) goto aBigger;
6233 if ( aSig0 < bSig0 ) goto bBigger;
6234 if ( bSig1 < aSig1 ) goto aBigger;
6235 if ( aSig1 < bSig1 ) goto bBigger;
a2f2d288
PM
6236 return packFloat128(status->float_rounding_mode == float_round_down,
6237 0, 0, 0);
158142c2
FB
6238 bExpBigger:
6239 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6240 if (bSig0 | bSig1) {
6241 return propagateFloat128NaN(a, b, status);
6242 }
158142c2
FB
6243 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6244 }
6245 if ( aExp == 0 ) {
6246 ++expDiff;
6247 }
6248 else {
6249 aSig0 |= LIT64( 0x4000000000000000 );
6250 }
6251 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6252 bSig0 |= LIT64( 0x4000000000000000 );
6253 bBigger:
6254 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6255 zExp = bExp;
6256 zSign ^= 1;
6257 goto normalizeRoundAndPack;
6258 aExpBigger:
6259 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6260 if (aSig0 | aSig1) {
6261 return propagateFloat128NaN(a, b, status);
6262 }
158142c2
FB
6263 return a;
6264 }
6265 if ( bExp == 0 ) {
6266 --expDiff;
6267 }
6268 else {
6269 bSig0 |= LIT64( 0x4000000000000000 );
6270 }
6271 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6272 aSig0 |= LIT64( 0x4000000000000000 );
6273 aBigger:
6274 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6275 zExp = aExp;
6276 normalizeRoundAndPack:
6277 --zExp;
ff32e16e
PM
6278 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6279 status);
158142c2
FB
6280
6281}
6282
6283/*----------------------------------------------------------------------------
6284| Returns the result of adding the quadruple-precision floating-point values
6285| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6286| for Binary Floating-Point Arithmetic.
6287*----------------------------------------------------------------------------*/
6288
e5a41ffa 6289float128 float128_add(float128 a, float128 b, float_status *status)
158142c2
FB
6290{
6291 flag aSign, bSign;
6292
6293 aSign = extractFloat128Sign( a );
6294 bSign = extractFloat128Sign( b );
6295 if ( aSign == bSign ) {
ff32e16e 6296 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6297 }
6298 else {
ff32e16e 6299 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6300 }
6301
6302}
6303
6304/*----------------------------------------------------------------------------
6305| Returns the result of subtracting the quadruple-precision floating-point
6306| values `a' and `b'. The operation is performed according to the IEC/IEEE
6307| Standard for Binary Floating-Point Arithmetic.
6308*----------------------------------------------------------------------------*/
6309
e5a41ffa 6310float128 float128_sub(float128 a, float128 b, float_status *status)
158142c2
FB
6311{
6312 flag aSign, bSign;
6313
6314 aSign = extractFloat128Sign( a );
6315 bSign = extractFloat128Sign( b );
6316 if ( aSign == bSign ) {
ff32e16e 6317 return subFloat128Sigs(a, b, aSign, status);
158142c2
FB
6318 }
6319 else {
ff32e16e 6320 return addFloat128Sigs(a, b, aSign, status);
158142c2
FB
6321 }
6322
6323}
6324
6325/*----------------------------------------------------------------------------
6326| Returns the result of multiplying the quadruple-precision floating-point
6327| values `a' and `b'. The operation is performed according to the IEC/IEEE
6328| Standard for Binary Floating-Point Arithmetic.
6329*----------------------------------------------------------------------------*/
6330
e5a41ffa 6331float128 float128_mul(float128 a, float128 b, float_status *status)
158142c2
FB
6332{
6333 flag aSign, bSign, zSign;
f4014512 6334 int32_t aExp, bExp, zExp;
bb98fe42 6335 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6336
6337 aSig1 = extractFloat128Frac1( a );
6338 aSig0 = extractFloat128Frac0( a );
6339 aExp = extractFloat128Exp( a );
6340 aSign = extractFloat128Sign( a );
6341 bSig1 = extractFloat128Frac1( b );
6342 bSig0 = extractFloat128Frac0( b );
6343 bExp = extractFloat128Exp( b );
6344 bSign = extractFloat128Sign( b );
6345 zSign = aSign ^ bSign;
6346 if ( aExp == 0x7FFF ) {
6347 if ( ( aSig0 | aSig1 )
6348 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6349 return propagateFloat128NaN(a, b, status);
158142c2
FB
6350 }
6351 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6352 return packFloat128( zSign, 0x7FFF, 0, 0 );
6353 }
6354 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6355 if (bSig0 | bSig1) {
6356 return propagateFloat128NaN(a, b, status);
6357 }
158142c2
FB
6358 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6359 invalid:
ff32e16e 6360 float_raise(float_flag_invalid, status);
af39bc8c 6361 return float128_default_nan(status);
158142c2
FB
6362 }
6363 return packFloat128( zSign, 0x7FFF, 0, 0 );
6364 }
6365 if ( aExp == 0 ) {
6366 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6367 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6368 }
6369 if ( bExp == 0 ) {
6370 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6371 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6372 }
6373 zExp = aExp + bExp - 0x4000;
6374 aSig0 |= LIT64( 0x0001000000000000 );
6375 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6376 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6377 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6378 zSig2 |= ( zSig3 != 0 );
6379 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6380 shift128ExtraRightJamming(
6381 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6382 ++zExp;
6383 }
ff32e16e 6384 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6385
6386}
6387
6388/*----------------------------------------------------------------------------
6389| Returns the result of dividing the quadruple-precision floating-point value
6390| `a' by the corresponding value `b'. The operation is performed according to
6391| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6392*----------------------------------------------------------------------------*/
6393
e5a41ffa 6394float128 float128_div(float128 a, float128 b, float_status *status)
158142c2
FB
6395{
6396 flag aSign, bSign, zSign;
f4014512 6397 int32_t aExp, bExp, zExp;
bb98fe42
AF
6398 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6399 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6400
6401 aSig1 = extractFloat128Frac1( a );
6402 aSig0 = extractFloat128Frac0( a );
6403 aExp = extractFloat128Exp( a );
6404 aSign = extractFloat128Sign( a );
6405 bSig1 = extractFloat128Frac1( b );
6406 bSig0 = extractFloat128Frac0( b );
6407 bExp = extractFloat128Exp( b );
6408 bSign = extractFloat128Sign( b );
6409 zSign = aSign ^ bSign;
6410 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6411 if (aSig0 | aSig1) {
6412 return propagateFloat128NaN(a, b, status);
6413 }
158142c2 6414 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6415 if (bSig0 | bSig1) {
6416 return propagateFloat128NaN(a, b, status);
6417 }
158142c2
FB
6418 goto invalid;
6419 }
6420 return packFloat128( zSign, 0x7FFF, 0, 0 );
6421 }
6422 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6423 if (bSig0 | bSig1) {
6424 return propagateFloat128NaN(a, b, status);
6425 }
158142c2
FB
6426 return packFloat128( zSign, 0, 0, 0 );
6427 }
6428 if ( bExp == 0 ) {
6429 if ( ( bSig0 | bSig1 ) == 0 ) {
6430 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6431 invalid:
ff32e16e 6432 float_raise(float_flag_invalid, status);
af39bc8c 6433 return float128_default_nan(status);
158142c2 6434 }
ff32e16e 6435 float_raise(float_flag_divbyzero, status);
158142c2
FB
6436 return packFloat128( zSign, 0x7FFF, 0, 0 );
6437 }
6438 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6439 }
6440 if ( aExp == 0 ) {
6441 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6442 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6443 }
6444 zExp = aExp - bExp + 0x3FFD;
6445 shortShift128Left(
6446 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6447 shortShift128Left(
6448 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6449 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6450 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6451 ++zExp;
6452 }
6453 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6454 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6455 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6456 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6457 --zSig0;
6458 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6459 }
6460 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6461 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6462 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6463 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6464 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6465 --zSig1;
6466 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6467 }
6468 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6469 }
6470 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
ff32e16e 6471 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6472
6473}
6474
6475/*----------------------------------------------------------------------------
6476| Returns the remainder of the quadruple-precision floating-point value `a'
6477| with respect to the corresponding value `b'. The operation is performed
6478| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6479*----------------------------------------------------------------------------*/
6480
e5a41ffa 6481float128 float128_rem(float128 a, float128 b, float_status *status)
158142c2 6482{
ed086f3d 6483 flag aSign, zSign;
f4014512 6484 int32_t aExp, bExp, expDiff;
bb98fe42
AF
6485 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6486 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6487 int64_t sigMean0;
158142c2
FB
6488
6489 aSig1 = extractFloat128Frac1( a );
6490 aSig0 = extractFloat128Frac0( a );
6491 aExp = extractFloat128Exp( a );
6492 aSign = extractFloat128Sign( a );
6493 bSig1 = extractFloat128Frac1( b );
6494 bSig0 = extractFloat128Frac0( b );
6495 bExp = extractFloat128Exp( b );
158142c2
FB
6496 if ( aExp == 0x7FFF ) {
6497 if ( ( aSig0 | aSig1 )
6498 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
ff32e16e 6499 return propagateFloat128NaN(a, b, status);
158142c2
FB
6500 }
6501 goto invalid;
6502 }
6503 if ( bExp == 0x7FFF ) {
ff32e16e
PM
6504 if (bSig0 | bSig1) {
6505 return propagateFloat128NaN(a, b, status);
6506 }
158142c2
FB
6507 return a;
6508 }
6509 if ( bExp == 0 ) {
6510 if ( ( bSig0 | bSig1 ) == 0 ) {
6511 invalid:
ff32e16e 6512 float_raise(float_flag_invalid, status);
af39bc8c 6513 return float128_default_nan(status);
158142c2
FB
6514 }
6515 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6516 }
6517 if ( aExp == 0 ) {
6518 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6519 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6520 }
6521 expDiff = aExp - bExp;
6522 if ( expDiff < -1 ) return a;
6523 shortShift128Left(
6524 aSig0 | LIT64( 0x0001000000000000 ),
6525 aSig1,
6526 15 - ( expDiff < 0 ),
6527 &aSig0,
6528 &aSig1
6529 );
6530 shortShift128Left(
6531 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6532 q = le128( bSig0, bSig1, aSig0, aSig1 );
6533 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6534 expDiff -= 64;
6535 while ( 0 < expDiff ) {
6536 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6537 q = ( 4 < q ) ? q - 4 : 0;
6538 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6539 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6540 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6541 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6542 expDiff -= 61;
6543 }
6544 if ( -64 < expDiff ) {
6545 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6546 q = ( 4 < q ) ? q - 4 : 0;
6547 q >>= - expDiff;
6548 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6549 expDiff += 52;
6550 if ( expDiff < 0 ) {
6551 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6552 }
6553 else {
6554 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6555 }
6556 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6557 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6558 }
6559 else {
6560 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6561 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6562 }
6563 do {
6564 alternateASig0 = aSig0;
6565 alternateASig1 = aSig1;
6566 ++q;
6567 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6568 } while ( 0 <= (int64_t) aSig0 );
158142c2 6569 add128(
bb98fe42 6570 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6571 if ( ( sigMean0 < 0 )
6572 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6573 aSig0 = alternateASig0;
6574 aSig1 = alternateASig1;
6575 }
bb98fe42 6576 zSign = ( (int64_t) aSig0 < 0 );
158142c2 6577 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
ff32e16e
PM
6578 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6579 status);
158142c2
FB
6580}
6581
6582/*----------------------------------------------------------------------------
6583| Returns the square root of the quadruple-precision floating-point value `a'.
6584| The operation is performed according to the IEC/IEEE Standard for Binary
6585| Floating-Point Arithmetic.
6586*----------------------------------------------------------------------------*/
6587
e5a41ffa 6588float128 float128_sqrt(float128 a, float_status *status)
158142c2
FB
6589{
6590 flag aSign;
f4014512 6591 int32_t aExp, zExp;
bb98fe42
AF
6592 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6593 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6594
6595 aSig1 = extractFloat128Frac1( a );
6596 aSig0 = extractFloat128Frac0( a );
6597 aExp = extractFloat128Exp( a );
6598 aSign = extractFloat128Sign( a );
6599 if ( aExp == 0x7FFF ) {
ff32e16e
PM
6600 if (aSig0 | aSig1) {
6601 return propagateFloat128NaN(a, a, status);
6602 }
158142c2
FB
6603 if ( ! aSign ) return a;
6604 goto invalid;
6605 }
6606 if ( aSign ) {
6607 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6608 invalid:
ff32e16e 6609 float_raise(float_flag_invalid, status);
af39bc8c 6610 return float128_default_nan(status);
158142c2
FB
6611 }
6612 if ( aExp == 0 ) {
6613 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6614 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6615 }
6616 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6617 aSig0 |= LIT64( 0x0001000000000000 );
6618 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6619 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6620 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6621 doubleZSig0 = zSig0<<1;
6622 mul64To128( zSig0, zSig0, &term0, &term1 );
6623 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6624 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6625 --zSig0;
6626 doubleZSig0 -= 2;
6627 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6628 }
6629 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6630 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6631 if ( zSig1 == 0 ) zSig1 = 1;
6632 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6633 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6634 mul64To128( zSig1, zSig1, &term2, &term3 );
6635 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6636 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6637 --zSig1;
6638 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6639 term3 |= 1;
6640 term2 |= doubleZSig0;
6641 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6642 }
6643 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6644 }
6645 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
ff32e16e 6646 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
158142c2
FB
6647
6648}
6649
6650/*----------------------------------------------------------------------------
6651| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6652| the corresponding value `b', and 0 otherwise. The invalid exception is
6653| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6654| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6655*----------------------------------------------------------------------------*/
6656
e5a41ffa 6657int float128_eq(float128 a, float128 b, float_status *status)
158142c2
FB
6658{
6659
6660 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6661 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6662 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6663 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6664 ) {
ff32e16e 6665 float_raise(float_flag_invalid, status);
158142c2
FB
6666 return 0;
6667 }
6668 return
6669 ( a.low == b.low )
6670 && ( ( a.high == b.high )
6671 || ( ( a.low == 0 )
bb98fe42 6672 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6673 );
6674
6675}
6676
6677/*----------------------------------------------------------------------------
6678| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6679| or equal to the corresponding value `b', and 0 otherwise. The invalid
6680| exception is raised if either operand is a NaN. The comparison is performed
6681| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6682*----------------------------------------------------------------------------*/
6683
e5a41ffa 6684int float128_le(float128 a, float128 b, float_status *status)
158142c2
FB
6685{
6686 flag aSign, bSign;
6687
6688 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6689 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6690 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6691 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6692 ) {
ff32e16e 6693 float_raise(float_flag_invalid, status);
158142c2
FB
6694 return 0;
6695 }
6696 aSign = extractFloat128Sign( a );
6697 bSign = extractFloat128Sign( b );
6698 if ( aSign != bSign ) {
6699 return
6700 aSign
bb98fe42 6701 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6702 == 0 );
6703 }
6704 return
6705 aSign ? le128( b.high, b.low, a.high, a.low )
6706 : le128( a.high, a.low, b.high, b.low );
6707
6708}
6709
6710/*----------------------------------------------------------------------------
6711| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6712| the corresponding value `b', and 0 otherwise. The invalid exception is
6713| raised if either operand is a NaN. The comparison is performed according
6714| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6715*----------------------------------------------------------------------------*/
6716
e5a41ffa 6717int float128_lt(float128 a, float128 b, float_status *status)
158142c2
FB
6718{
6719 flag aSign, bSign;
6720
6721 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6722 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6723 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6724 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6725 ) {
ff32e16e 6726 float_raise(float_flag_invalid, status);
158142c2
FB
6727 return 0;
6728 }
6729 aSign = extractFloat128Sign( a );
6730 bSign = extractFloat128Sign( b );
6731 if ( aSign != bSign ) {
6732 return
6733 aSign
bb98fe42 6734 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6735 != 0 );
6736 }
6737 return
6738 aSign ? lt128( b.high, b.low, a.high, a.low )
6739 : lt128( a.high, a.low, b.high, b.low );
6740
6741}
6742
67b7861d
AJ
6743/*----------------------------------------------------------------------------
6744| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6745| be compared, and 0 otherwise. The invalid exception is raised if either
6746| operand is a NaN. The comparison is performed according to the IEC/IEEE
6747| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6748*----------------------------------------------------------------------------*/
6749
e5a41ffa 6750int float128_unordered(float128 a, float128 b, float_status *status)
67b7861d
AJ
6751{
6752 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6753 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6754 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6755 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6756 ) {
ff32e16e 6757 float_raise(float_flag_invalid, status);
67b7861d
AJ
6758 return 1;
6759 }
6760 return 0;
6761}
6762
158142c2
FB
6763/*----------------------------------------------------------------------------
6764| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6765| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6766| exception. The comparison is performed according to the IEC/IEEE Standard
6767| for Binary Floating-Point Arithmetic.
158142c2
FB
6768*----------------------------------------------------------------------------*/
6769
e5a41ffa 6770int float128_eq_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6771{
6772
6773 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6774 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6775 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6776 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6777 ) {
af39bc8c
AM
6778 if (float128_is_signaling_nan(a, status)
6779 || float128_is_signaling_nan(b, status)) {
ff32e16e 6780 float_raise(float_flag_invalid, status);
b689362d 6781 }
158142c2
FB
6782 return 0;
6783 }
6784 return
6785 ( a.low == b.low )
6786 && ( ( a.high == b.high )
6787 || ( ( a.low == 0 )
bb98fe42 6788 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6789 );
6790
6791}
6792
6793/*----------------------------------------------------------------------------
6794| Returns 1 if the quadruple-precision floating-point value `a' is less than
6795| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6796| cause an exception. Otherwise, the comparison is performed according to the
6797| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6798*----------------------------------------------------------------------------*/
6799
e5a41ffa 6800int float128_le_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6801{
6802 flag aSign, bSign;
6803
6804 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6805 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6806 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6807 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6808 ) {
af39bc8c
AM
6809 if (float128_is_signaling_nan(a, status)
6810 || float128_is_signaling_nan(b, status)) {
ff32e16e 6811 float_raise(float_flag_invalid, status);
158142c2
FB
6812 }
6813 return 0;
6814 }
6815 aSign = extractFloat128Sign( a );
6816 bSign = extractFloat128Sign( b );
6817 if ( aSign != bSign ) {
6818 return
6819 aSign
bb98fe42 6820 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6821 == 0 );
6822 }
6823 return
6824 aSign ? le128( b.high, b.low, a.high, a.low )
6825 : le128( a.high, a.low, b.high, b.low );
6826
6827}
6828
6829/*----------------------------------------------------------------------------
6830| Returns 1 if the quadruple-precision floating-point value `a' is less than
6831| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6832| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6833| Standard for Binary Floating-Point Arithmetic.
6834*----------------------------------------------------------------------------*/
6835
e5a41ffa 6836int float128_lt_quiet(float128 a, float128 b, float_status *status)
158142c2
FB
6837{
6838 flag aSign, bSign;
6839
6840 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6841 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6842 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6843 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6844 ) {
af39bc8c
AM
6845 if (float128_is_signaling_nan(a, status)
6846 || float128_is_signaling_nan(b, status)) {
ff32e16e 6847 float_raise(float_flag_invalid, status);
158142c2
FB
6848 }
6849 return 0;
6850 }
6851 aSign = extractFloat128Sign( a );
6852 bSign = extractFloat128Sign( b );
6853 if ( aSign != bSign ) {
6854 return
6855 aSign
bb98fe42 6856 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6857 != 0 );
6858 }
6859 return
6860 aSign ? lt128( b.high, b.low, a.high, a.low )
6861 : lt128( a.high, a.low, b.high, b.low );
6862
6863}
6864
67b7861d
AJ
6865/*----------------------------------------------------------------------------
6866| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6867| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6868| comparison is performed according to the IEC/IEEE Standard for Binary
6869| Floating-Point Arithmetic.
6870*----------------------------------------------------------------------------*/
6871
e5a41ffa 6872int float128_unordered_quiet(float128 a, float128 b, float_status *status)
67b7861d
AJ
6873{
6874 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6875 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6876 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6877 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6878 ) {
af39bc8c
AM
6879 if (float128_is_signaling_nan(a, status)
6880 || float128_is_signaling_nan(b, status)) {
ff32e16e 6881 float_raise(float_flag_invalid, status);
67b7861d
AJ
6882 }
6883 return 1;
6884 }
6885 return 0;
6886}
6887
e5a41ffa
PM
6888static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6889 int is_quiet, float_status *status)
f6714d36
AJ
6890{
6891 flag aSign, bSign;
6892
d1eb8f2a
AD
6893 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6894 float_raise(float_flag_invalid, status);
6895 return float_relation_unordered;
6896 }
f6714d36
AJ
6897 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6898 ( extractFloatx80Frac( a )<<1 ) ) ||
6899 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6900 ( extractFloatx80Frac( b )<<1 ) )) {
6901 if (!is_quiet ||
af39bc8c
AM
6902 floatx80_is_signaling_nan(a, status) ||
6903 floatx80_is_signaling_nan(b, status)) {
ff32e16e 6904 float_raise(float_flag_invalid, status);
f6714d36
AJ
6905 }
6906 return float_relation_unordered;
6907 }
6908 aSign = extractFloatx80Sign( a );
6909 bSign = extractFloatx80Sign( b );
6910 if ( aSign != bSign ) {
6911
6912 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6913 ( ( a.low | b.low ) == 0 ) ) {
6914 /* zero case */
6915 return float_relation_equal;
6916 } else {
6917 return 1 - (2 * aSign);
6918 }
6919 } else {
6920 if (a.low == b.low && a.high == b.high) {
6921 return float_relation_equal;
6922 } else {
6923 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6924 }
6925 }
6926}
6927
e5a41ffa 6928int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
f6714d36 6929{
ff32e16e 6930 return floatx80_compare_internal(a, b, 0, status);
f6714d36
AJ
6931}
6932
e5a41ffa 6933int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
f6714d36 6934{
ff32e16e 6935 return floatx80_compare_internal(a, b, 1, status);
f6714d36
AJ
6936}
6937
e5a41ffa
PM
6938static inline int float128_compare_internal(float128 a, float128 b,
6939 int is_quiet, float_status *status)
1f587329
BS
6940{
6941 flag aSign, bSign;
6942
6943 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6944 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6945 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6946 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6947 if (!is_quiet ||
af39bc8c
AM
6948 float128_is_signaling_nan(a, status) ||
6949 float128_is_signaling_nan(b, status)) {
ff32e16e 6950 float_raise(float_flag_invalid, status);
1f587329
BS
6951 }
6952 return float_relation_unordered;
6953 }
6954 aSign = extractFloat128Sign( a );
6955 bSign = extractFloat128Sign( b );
6956 if ( aSign != bSign ) {
6957 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6958 /* zero case */
6959 return float_relation_equal;
6960 } else {
6961 return 1 - (2 * aSign);
6962 }
6963 } else {
6964 if (a.low == b.low && a.high == b.high) {
6965 return float_relation_equal;
6966 } else {
6967 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6968 }
6969 }
6970}
6971
e5a41ffa 6972int float128_compare(float128 a, float128 b, float_status *status)
1f587329 6973{
ff32e16e 6974 return float128_compare_internal(a, b, 0, status);
1f587329
BS
6975}
6976
e5a41ffa 6977int float128_compare_quiet(float128 a, float128 b, float_status *status)
1f587329 6978{
ff32e16e 6979 return float128_compare_internal(a, b, 1, status);
1f587329
BS
6980}
6981
e5a41ffa 6982floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
9ee6e8bb
PB
6983{
6984 flag aSign;
326b9e98 6985 int32_t aExp;
bb98fe42 6986 uint64_t aSig;
9ee6e8bb 6987
d1eb8f2a
AD
6988 if (floatx80_invalid_encoding(a)) {
6989 float_raise(float_flag_invalid, status);
6990 return floatx80_default_nan(status);
6991 }
9ee6e8bb
PB
6992 aSig = extractFloatx80Frac( a );
6993 aExp = extractFloatx80Exp( a );
6994 aSign = extractFloatx80Sign( a );
6995
326b9e98
AJ
6996 if ( aExp == 0x7FFF ) {
6997 if ( aSig<<1 ) {
ff32e16e 6998 return propagateFloatx80NaN(a, a, status);
326b9e98 6999 }
9ee6e8bb
PB
7000 return a;
7001 }
326b9e98 7002
3c85c37f
PM
7003 if (aExp == 0) {
7004 if (aSig == 0) {
7005 return a;
7006 }
7007 aExp++;
7008 }
69397542 7009
326b9e98
AJ
7010 if (n > 0x10000) {
7011 n = 0x10000;
7012 } else if (n < -0x10000) {
7013 n = -0x10000;
7014 }
7015
9ee6e8bb 7016 aExp += n;
a2f2d288
PM
7017 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7018 aSign, aExp, aSig, 0, status);
9ee6e8bb 7019}
9ee6e8bb 7020
e5a41ffa 7021float128 float128_scalbn(float128 a, int n, float_status *status)
9ee6e8bb
PB
7022{
7023 flag aSign;
326b9e98 7024 int32_t aExp;
bb98fe42 7025 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7026
7027 aSig1 = extractFloat128Frac1( a );
7028 aSig0 = extractFloat128Frac0( a );
7029 aExp = extractFloat128Exp( a );
7030 aSign = extractFloat128Sign( a );
7031 if ( aExp == 0x7FFF ) {
326b9e98 7032 if ( aSig0 | aSig1 ) {
ff32e16e 7033 return propagateFloat128NaN(a, a, status);
326b9e98 7034 }
9ee6e8bb
PB
7035 return a;
7036 }
3c85c37f 7037 if (aExp != 0) {
69397542 7038 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7039 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7040 return a;
3c85c37f
PM
7041 } else {
7042 aExp++;
7043 }
69397542 7044
326b9e98
AJ
7045 if (n > 0x10000) {
7046 n = 0x10000;
7047 } else if (n < -0x10000) {
7048 n = -0x10000;
7049 }
7050
69397542
PB
7051 aExp += n - 1;
7052 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
ff32e16e 7053 , status);
9ee6e8bb
PB
7054
7055}
This page took 1.73716 seconds and 4 git commands to generate.